In [30]:
%%capture

!pip install bitsandbytes accelerate peft trl

use_flash_attention2 = False

import torch
# Replace attention with flash attention 
if torch.cuda.get_device_capability()[0] >= 8:
    use_flash_attention2 = True

print(f"Using flash attention 2: {use_flash_attention2}")

if use_flash_attention2:
    !pip install flash-attn --no-build-isolation --upgrade

import torch
print(torch.cuda.is_available())

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
import torch
from datasets import load_dataset, DatasetDict
from dotenv import load_dotenv
from random import seed
import os
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
import utils

# Set seed for reproducibility
seed(42)

class EnvironmentLoader:
    @staticmethod
    def load_env():
        load_dotenv()

class DatasetHandler:
    def __init__(self, data_path):
        self.data_path = data_path

    def load_and_split_dataset(self):
        dataset = load_dataset("json", data_files=self.data_path)
        train_test_split = dataset['train'].train_test_split(test_size=0.2)
        dataset_dict = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })
        return dataset_dict['train'], dataset_dict['test']

    @staticmethod
    def format_instruction(sample):
        return f"""
        Below is an instruction that describes a task, paired with an input that provides further context. 
        Write a response that appropriately completes the request.

        ### Instruction:
        {sample['Instruction']}

        ### Input:
        {sample['Input']}

        ### Response:
        {sample['Output']}
        """

class ModelManager:
    def __init__(self, model_id, use_flash_attention2, hf_token):
        self.model_id = model_id
        self.use_flash_attention2 = use_flash_attention2
        self.hf_token = hf_token
        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16 if use_flash_attention2 else torch.float16
        )
    
    def load_model_and_tokenizer(self):
        model = AutoModelForCausalLM.from_pretrained(
            self.model_id, 
            quantization_config=self.bnb_config, 
            use_cache=False, 
            device_map="auto",
            token=self.hf_token,  
            attn_implementation="flash_attention_2" if self.use_flash_attention2 else "sdpa"
        )
        model.config.pretraining_tp = 1

        tokenizer = AutoTokenizer.from_pretrained(
            self.model_id,
            token=self.hf_token
        )
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "right"
        
        return model, tokenizer

    def save_model_and_tokenizer(self, model, tokenizer, save_directory):
        model.save_pretrained(save_directory)
        tokenizer.save_pretrained(save_directory)
    
    @staticmethod
    def prepare_for_training(model):
        return prepare_model_for_kbit_training(model)

class Trainer:
    def __init__(self, model, tokenizer, train_dataset, peft_config, use_flash_attention2, output_dir):
        self.model = model
        self.tokenizer = tokenizer
        self.train_dataset = train_dataset
        self.peft_config = peft_config
        self.args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=1,
            per_device_train_batch_size=4,
            gradient_accumulation_steps=4,
            gradient_checkpointing=True,
            optim="paged_adamw_8bit",
            logging_steps=10,
            save_strategy="epoch",
            learning_rate=2e-4,
            bf16=use_flash_attention2,
            fp16=not use_flash_attention2,
            tf32=use_flash_attention2,
            max_grad_norm=0.3,
            warmup_steps=5,
            lr_scheduler_type="linear",
            disable_tqdm=False,
            report_to="none"
        )
        self.model = get_peft_model(self.model, self.peft_config)

    def train_model(self, format_instruction_func):
        trainer = SFTTrainer(
            model=self.model,
            train_dataset=self.train_dataset,
            peft_config=self.peft_config,
            max_seq_length=2048,
            tokenizer=self.tokenizer,
            packing=True,
            formatting_func=format_instruction_func, 
            args=self.args,
        )
        trainer.train()
        return trainer

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'dotenv'

In [32]:
os.environ["HF_TOKEN"]

'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'

In [6]:
EnvironmentLoader.load_env()
dataset_handler = DatasetHandler(data_path=utils.Variables.INSTRUCTION_DATASET_JSON_PATH)
train_dataset, test_dataset = dataset_handler.load_and_split_dataset()

new_test_dataset = []
for dict_ in test_dataset:
    dict_['Output'] = ''
    new_test_dataset.append(dict_)

model_manager = ModelManager(
    model_id="meta-llama/Meta-Llama-3-8B",
    use_flash_attention2=True,
    hf_token=os.environ["HF_TOKEN"]
)
model, tokenizer = model_manager.load_model_and_tokenizer()
model_manager.save_model_and_tokenizer(model, tokenizer, save_directory=utils.Variables.BASE_MODEL_PATH)
model = model_manager.prepare_for_training(model)

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",
    ]
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    peft_config=peft_config,
    use_flash_attention2=True,
    output_dir=utils.Variables.FINE_TUNED_MODEL_PATH
)
trained_model = trainer.train_model(format_instruction_func=dataset_handler.format_instruction)
trained_model.save_model()

Loading checkpoint shards: 100%|██████████| 4/4 [00:16<00:00,  4.08s/it]

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Generating train split: 122 examples [00:00, 361.32 examples/s]
  return fn(*args, **kwargs)
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


NameError: name '_flash_supports_window_size' is not defined

In [35]:
os.getenv('HF_TOKEN')

'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'

In [34]:
from dotenv import load_dotenv

load_dotenv()

True

In [38]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import os
from dotenv import load_dotenv
import time
from random import randrange, sample, seed

import torch
import os
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

seed(42)

from datasets import load_dataset
import utils
dataset = load_dataset("json", data_files=utils.Variables.INSTRUCTION_DATASET_JSON_PATH, split="train")

def format_instruction(sample):
	return f"""    
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{sample['Instruction']}

### Input:
{sample['Input']}

### Response:
{sample['Output']}
"""

from datasets import load_dataset, DatasetDict
import utils
# Load the entire dataset
dataset = load_dataset("json", data_files=utils.Variables.INSTRUCTION_DATASET_JSON_PATH)

# Split the dataset into training and testing sets
train_test_split = dataset['train'].train_test_split(test_size=0.2)

# Create a DatasetDict to hold the splits
dataset_dict = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

# Now you have separate training and testing sets
train_dataset = dataset_dict['train']
test_dataset = dataset_dict['test']


new_test_dataset = []
for dict_ in test_dataset:
    dict_['Output'] = ''
    new_test_dataset.append(dict_)

# Load environment variables from .env file
load_dotenv()

# Hugging Face model id
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_id = "mistralai/Mistral-7B-v0.1"

# BitsAndBytesConfig int-4 config 
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if use_flash_attention2 else torch.float16
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    quantization_config=bnb_config, 
    use_cache=False, 
    device_map="auto",
    token="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",#os.environ["HF_TOKEN"],  # if model is gated like llama or mistral
    attn_implementation="flash_attention_2" if use_flash_attention2 else "sdpa"
)
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=os.environ["HF_TOKEN"],  # if model is gated like llama or mistral
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Specify the directory where you want to save the model and tokenizer
save_directory = utils.Variables.BASE_MODEL_PATH

# Save the model
model.save_pretrained(save_directory)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)


# LoRA config based on QLoRA paper
peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj", 
            "up_proj", 
            "down_proj",
        ]
)

# Prepare model for training
model = prepare_model_for_kbit_training(model)


args = TrainingArguments(
    output_dir=utils.Variables.FINE_TUNED_MODEL_PATH,
    num_train_epochs=1,
    per_device_train_batch_size=4,#6 if use_flash_attention2 else 2, # you can play with the batch size depending on your hardware
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    bf16=use_flash_attention2,
    fp16=not use_flash_attention2,
    tf32=use_flash_attention2,
    max_grad_norm=0.3,
    warmup_steps=5,
    lr_scheduler_type="linear",
    disable_tqdm=False,
    report_to="none"
)

model = get_peft_model(model, peft_config)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    max_seq_length=2048,
    tokenizer=tokenizer,
    packing=True,
    formatting_func=format_instruction, 
    args=args,
)

# train
trainer.train()

# save model
trainer.save_model()

ValueError: `rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {'factor': 8.0, 'low_freq_factor': 1.0, 'high_freq_factor': 4.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}

In [10]:
!pip install --upgrade transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [11]:
!pip install --upgrade pip

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pip
  Downloading pip-24.1.2-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-24.1.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.0
    Uninstalling pip-24.0:
      Successfully uninstalled pip-24.0
Successfully installed pip-24.1.2


In [13]:
!pip install -r requirements.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting absl-py==2.1.0 (from -r requirements.txt (line 1))
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting accelerate==0.32.1 (from -r requirements.txt (line 2))
  Using cached accelerate-0.32.1-py3-none-any.whl.metadata (18 kB)
Collecting altair==5.3.0 (from -r requirements.txt (line 5))
  Using cached altair-5.3.0-py3-none-any.whl.metadata (9.2 kB)
Collecting annotated-types==0.7.0 (from -r requirements.txt (line 6))
  Using cached annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting anyio==4.4.0 (from -r requirements.txt (line 7))
  Using cached anyio-4.4.0-py3-none-any.whl.metadata (4.6 kB)
Collecting async-timeout==4.0.3 (from -r requirements.txt (line 9))
  Using cached async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Collecting bert-score==0.3.13 (from -r requirements.txt (line 11))
  Using cached bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting bitsandbytes==0.43.1 (from -r requirements.txt (line 12))
  Using cache

# Evaluation

In [4]:
import pandas as pd
import re
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
from transformers import BertTokenizer, BertForMaskedLM, BertModel
from bert_score import BERTScorer
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM
import time

def calculate_bleu_score(machine_results, reference_texts):
    bleu_score = corpus_bleu([[ref.split()] for ref in reference_texts], [gen.split() for gen in machine_results])
    print(f'BLEU Score: {bleu_score}')

def calculate_rouge_scores(generated_answers, ground_truth):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    total_rouge1, total_rouge2, total_rougeL = 0, 0, 0
    for gen, ref in zip(generated_answers, ground_truth):
        scores = scorer.score(gen, ref)
        total_rouge1 += scores['rouge1'].fmeasure
        total_rouge2 += scores['rouge2'].fmeasure
        total_rougeL += scores['rougeL'].fmeasure
    average_rouge1 = total_rouge1 / len(generated_answers)
    average_rouge2 = total_rouge2 / len(generated_answers)
    average_rougeL = total_rougeL / len(generated_answers)
    return {'average_rouge1':average_rouge1,
            'average_rouge2':average_rouge2,
            'average_rougeL':average_rougeL}
    print(f'Average ROUGE-1: {average_rouge1}')
    print(f'Average ROUGE-2: {average_rouge2}')
    print(f'Average ROUGE-L: {average_rougeL}')

class ModelHandler:

    def __init__(self):
        pass

    def loading_model(self, model_chosen='fine_tuned_model'):

        if model_chosen == 'fine_tuned_model':
            model_dir=utils.Variables.FINE_TUNED_MODEL_PATH
            self.model = AutoPeftModelForCausalLM.from_pretrained(
                model_dir,
                low_cpu_mem_usage=True,
                torch_dtype=torch.float16,
                load_in_4bit=True,
                )

        elif model_chosen == 'base_model':
            model_dir=utils.Variables.BASE_MODEL_PATH
            self.model = AutoModelForCausalLM.from_pretrained(
                model_dir,
                low_cpu_mem_usage=True,
                torch_dtype=torch.float16,
                load_in_4bit=True,
                )

        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)

    def ask_question(self, instruction, temperature=0.5, max_new_tokens = 1000):

        prompt = format_instruction(instruction)

        input_ids = self.tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()

        start_time = time.time()
        with torch.inference_mode():
            outputs = self.model.generate(input_ids=input_ids, pad_token_id=self.tokenizer.eos_token_id, max_new_tokens=max_new_tokens, do_sample=True, top_p=0.5,temperature=temperature)
        end_time = time.time()

        total_time = end_time - start_time
        output_length = len(outputs[0])-len(input_ids[0])

        self.output = self.tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]

        return self.output

    def parse_output(self):    

        # Split the text at the word "Response"
        parts = self.output.split("Response:", 1)

        # Check if "Response" is in the text and get the part after it
        if len(parts) > 1:
            response_text = parts[1].strip()
        else:
            response_text = ""

        return response_text

In [5]:
modelhandler_finetuned = ModelHandler()
modelhandler_finetuned.loading_model(model_chosen = 'fine_tuned_model')
ground_truths_list = []
finetuned_model_generated_answers = []

for triple_with_output, triple_without_output in zip(test_dataset, new_test_dataset):

    modelhandler_finetuned.ask_question(triple_without_output)
    finetuned_model_generated_answer = modelhandler_finetuned.parse_output()
    ground_truth = triple_with_output['Output']

    finetuned_model_generated_answers.append(finetuned_model_generated_answer)
    ground_truths_list.append(ground_truth)

    
rouge_score_fine_tuned_model = calculate_rouge_scores(finetuned_model_generated_answers,ground_truths_list)
rouge_score_fine_tuned_model

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards: 100%|██████████| 4/4 [00:15<00:00,  3.76s/it]
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


{'average_rouge1': 0.39997816307812206,
 'average_rouge2': 0.2213826792342886,
 'average_rougeL': 0.33508922374837047}

In [6]:
modelhandler_base = ModelHandler()
modelhandler_base.loading_model(model_chosen = 'base_model')
base_model_generated_answers = []
ground_truths_list = []

for triple_with_output, triple_without_output in zip(test_dataset, new_test_dataset):

    print(rf'Running triple {triple_with_output}')

    modelhandler_base.ask_question(triple_without_output)
    base_model_generated_answer = modelhandler_base.parse_output()
    ground_truth = triple_with_output['Output']

    base_model_generated_answers.append(base_model_generated_answer)
    ground_truths_list.append(ground_truth)
    
rouge_score_base_model = calculate_rouge_scores(base_model_generated_answers,ground_truths_list)
rouge_score_base_model

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Running triple {'Instruction': 'Describe the scope of social bias in NLP models, including LLMs.', 'Input': '"Various types of social biases in NLP models have been reported, including word vectors, MLMs, and now LLMs."', 'Output': '"Social bias is a widespread issue in NLP models, affecting word vectors, MLMs, and LLMs."'}
Running triple {'Instruction': 'Describe how adding context to the prompt can improve the reasoning performance of an LLM.', 'Input': '"In LLMs, adding context to the prompt can increase the ID (depending on how related is the context to the question)"', 'Output': '"Adding context to the prompt can increase the intrinsic dimension, leading to more complex input and improved reasoning performance of the model."'}
Running triple {'Instruction': 'Summarize the goal of model alignment in Large Language Model (LLM) research.', 'Input': '"The goal of aligning models with human values, known as alignment, is a driving force in current LLM research."', 'Output': '"The goal 

KeyboardInterrupt: 

In [9]:
json_path = '/workspace/llama3_8b_finetuning/data/arvix_instruction_dataset.json'
import pandas as pd
import json
json_path = '/workspace/llama3_8b_finetuning/data/arvix_instruction_dataset.json'

with open(json_path, 'r') as file:
    json_data = json.load(file)


df = pd.json_normalize(json_data)

df


Unnamed: 0,Instruction,Input,Output
0,Summarize the main contribution of LoRA-Guard ...,"""We introduce LoRA-Guard, a parameter-efficien...","""LoRA-Guard is a parameter-efficient method fo..."
1,Explain the limitation of existing model-based...,"""Existing model-based guardrails have not been...","""Existing model-based guardrails are not suita..."
2,Describe the advantage of LoRA-Guard over exis...,"""We show that LoRA-Guard outperforms existing ...","""LoRA-Guard achieves better performance than e..."
3,Describe the main idea behind LoRA-Guard.,"""LoRA-Guard uses a low-rank adapter on a backb...","""LoRA-Guard integrates a chat model and a guar..."
4,Explain the advantage of the dual path design ...,"""The dual path design of LoRA-Guard, based on ...","""LoRA-Guard's dual path design prevents perfor..."
...,...,...,...
2424,List the dataset name for the given translatio...,"""task1111 ted translation he it""","""TED""\n\n"
2425,Identify the source and target languages for t...,"""task1020 pib translation telugu oriya""","""Telugu-Oriya"""
2426,Identify the task type of the given dataset.,"""task1340 msr text compression""","""Sentence Compression""\n\n"
2427,List the datasets related to summarization tasks.,"""Table A1""","""xlsum, amazon and yelp summarization dataset,..."


In [10]:
df.to_csv('./csv_instructions')

In [19]:
df = pd.read_csv('./csv_instructions').rename(columns = {'Instruction':'instruction', 'Input':'input', 'Output':'output'})
df[['instruction','input','output']].to_csv('./csv_instructions',index=False)

In [24]:
df = pd.read_csv('./csv_instructions')
df = df[['instruction', 'output']].rename(columns={'instruction':'text', 'output':'target'})
df.to_csv('./csv_instructions2', index = False)
df


Unnamed: 0,text,target
0,Summarize the main contribution of LoRA-Guard ...,"""LoRA-Guard is a parameter-efficient method fo..."
1,Explain the limitation of existing model-based...,"""Existing model-based guardrails are not suita..."
2,Describe the advantage of LoRA-Guard over exis...,"""LoRA-Guard achieves better performance than e..."
3,Describe the main idea behind LoRA-Guard.,"""LoRA-Guard integrates a chat model and a guar..."
4,Explain the advantage of the dual path design ...,"""LoRA-Guard's dual path design prevents perfor..."
...,...,...
2424,List the dataset name for the given translatio...,"""TED""\n\n"
2425,Identify the source and target languages for t...,"""Telugu-Oriya"""
2426,Identify the task type of the given dataset.,"""Sentence Compression""\n\n"
2427,List the datasets related to summarization tasks.,"""xlsum, amazon and yelp summarization dataset,..."


In [28]:
df = pd.read_csv('./csv_instructions2')


df['conversation'] = 'human: ' + df['text'] + ' \n bot: ' + df['target']

# Dropping the original columns if needed
df = df[['conversation']]

df.to_csv('./instruction_single_column_csv', index=False)

In [29]:
pd.read_csv('instruction_single_column_csv')

Unnamed: 0,conversation
0,human: Summarize the main contribution of LoRA...
1,human: Explain the limitation of existing mode...
2,human: Describe the advantage of LoRA-Guard ov...
3,human: Describe the main idea behind LoRA-Guar...
4,human: Explain the advantage of the dual path ...
...,...
2424,human: List the dataset name for the given tra...
2425,human: Identify the source and target language...
2426,human: Identify the task type of the given dat...
2427,human: List the datasets related to summarizat...
