#**Step 1: Install All the Required Packages**

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m194.6/244.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m119.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━

#**Step 2: Import All the Required Libraries**

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


#We will reformat our instruction dataset to follow Llama 2’s template.

We have used Alpaca format introduced by Stanford to prepare dataset.

For more info visit this link : https://github.com/tatsu-lab/stanford_alpaca


#To drastically reduce the VRAM usage, we must fine-tune the model in 4-bit precision, which is why we’ll use QLoRA here.

#**Step 3**

QLoRA will use a rank of 64 with a scaling parameter of 16. We’ll load the Llama 2 model directly in 4-bit precision using the NF4 type and train it for one epoch

In [None]:
# The model that you want to train from the Hugging Face hub
model_name = "NousResearch/Llama-2-7b-chat-hf"

# The instruction dataset to use
#dataset_name = "ekshat/text-2-sql-with-context"

# Fine-tuned model name
new_model = "/content/Llama2"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "/content/results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 8

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

#**Step 4:Load everything and start the fine-tuning process**

1. First of all, we want to load the dataset we defined. Here, our dataset is already preprocessed but, usually, this is where you would reformat the prompt, filter out bad text, combine multiple datasets, etc.


2. Then, we’re configuring bitsandbytes for 4-bit quantization.


3. Next, we're loading the Llama 2 model in 4-bit precision on a GPU with the corresponding tokenizer.


4. Finally, we're loading configurations for QLoRA, regular training parameters, and passing everything to the SFTTrainer. The training can finally start!

In [None]:
import pandas as pd

df_train_ =pd.read_csv("/content/dataset_train.csv")
df_train_.head()

Unnamed: 0,text
0,as a Who was the candidate in the election in ...
1,"as a What is 17th c., when Initial-Syllable Op..."
2,as a Name the regular season for final playoff...
3,"as a If the season is before 2000, the runner ..."
4,as a When the Away team scored 20.15 (135) on ...


In [None]:
train_file = "/content/dataset_train.csv"
# Get the datasets

from datasets import load_dataset

data_files = {}
dataset_args = {}
validation_split_percentage = 5
extension = "csv"
data_files = {
    "train": train_file,
}

raw_datasets = load_dataset(
    extension,
    sep=";",
    data_files=data_files
)

raw_datasets["validation"] = load_dataset(
    extension,
    sep=";",
    data_files=data_files,
    split=f"train[:{validation_split_percentage}%]",
    **dataset_args,
)

raw_datasets["train"] = load_dataset(
    extension,
    sep=";",
    data_files=data_files,
    split=f"train[{validation_split_percentage}%:]",
    **dataset_args,
)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 61759
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3250
    })
})

In [None]:
raw_datasets['train']['text'][1:10]

['as a what character did Masaharu satou play, SELECT Character Name FROM table WHERE Voice Actor (Japanese) = masaharu satou<|endoftext|>',
 'as a What record has  c. j. miles (20) in the high points?, SELECT Record FROM table WHERE High points = C. J. Miles (20)<|endoftext|>',
 'as a Who composed the work conducted by jaroslav Vogel? , SELECT Composer FROM table WHERE Conductor = Jaroslav Vogel<|endoftext|>',
 "as a What's the TFR for the period with NC of 13.4?, SELECT TFR* FROM table WHERE NC* = 13.4<|endoftext|>",
 'as a how many times is the model ge40lfr?, SELECT COUNT Manufacturer FROM table WHERE Model = GE40LFR<|endoftext|>',
 'as a What is the average dismissals of 83 test and catches less than 33?, SELECT AVG Total Dismissals FROM table WHERE Tests = 83 AND Catches < 33<|endoftext|>',
 'as a Name the number of score for sacramento, SELECT COUNT Score FROM table WHERE Team = Sacramento<|endoftext|>',
 'as a What is the Tyre for the united states grand prix?, SELECT Tyre FROM

In [None]:
raw_datasets['validation']['text'][1:10]

['as a What is 17th c., when Initial-Syllable Open/Semi-Open Unstressed Vowels is "o /ɵ/"?, SELECT 17th c. FROM table WHERE Initial-syllable open/semi-open unstressed vowels = o /ɵ/<|endoftext|>',
 'as a Name the regular season for final playoffs, SELECT Regular Season FROM table WHERE Playoffs = Final<|endoftext|>',
 "as a If the season is before 2000, the runner up was north melbourne, and it's the pre-season cup, what's the sum of attendees?, SELECT SUM Attendance FROM table WHERE Premiership = pre-season cup AND Runner Up = north melbourne AND Season < 2000<|endoftext|>",
 'as a When the Away team scored 20.15 (135) on the Date of 23 april 1973, how many people were in the crowd?, SELECT Crowd FROM table WHERE Date = 23 april 1973 AND Away team score = 20.15 (135)<|endoftext|>',
 'as a Which From club had a Transfer fee of £3.87m?, SELECT From club FROM table WHERE Transfer fee = £3.87m<|endoftext|>',
 'as a What are the names of the episodes that airs at 2:00pm?, SELECT 02:00 PM F

In [None]:
# Load dataset (you can process it here)
#from datasets import Dataset
#dataset = Dataset.load_from_disk(dataset_name)
#from datasets import load_dataset
#datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')
#dataset = load_dataset(dataset_name)

In [None]:
dataset =raw_datasets
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 61759
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3250
    })
})

In [None]:
dataset['train']['text']

['as a Can you tell me the total number of Gain that has the Name of williams, jonathan, and the Loss larger than 3?, SELECT COUNT Gain FROM table WHERE Name = williams, jonathan AND Loss > 3<|endoftext|>',
 'as a what character did Masaharu satou play, SELECT Character Name FROM table WHERE Voice Actor (Japanese) = masaharu satou<|endoftext|>',
 'as a What record has  c. j. miles (20) in the high points?, SELECT Record FROM table WHERE High points = C. J. Miles (20)<|endoftext|>',
 'as a Who composed the work conducted by jaroslav Vogel? , SELECT Composer FROM table WHERE Conductor = Jaroslav Vogel<|endoftext|>',
 "as a What's the TFR for the period with NC of 13.4?, SELECT TFR* FROM table WHERE NC* = 13.4<|endoftext|>",
 'as a how many times is the model ge40lfr?, SELECT COUNT Manufacturer FROM table WHERE Model = GE40LFR<|endoftext|>',
 'as a What is the average dismissals of 83 test and catches less than 33?, SELECT AVG Total Dismissals FROM table WHERE Tests = 83 AND Catches < 33<

In [None]:
#from sklearn.model_selection import train_test_split

#dataset = dataset['train'].train_test_split(test_size=0.08)


In [None]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)


(…)ma-2-7b-chat-hf/resolve/main/config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

(…)esolve/main/model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

(…)t-hf/resolve/main/generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

(…)at-hf/resolve/main/tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

(…)2-7b-chat-hf/resolve/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

(…)b-chat-hf/resolve/main/added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

(…)-hf/resolve/main/special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [None]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)



Map:   0%|          | 0/61759 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,3.7294
50,4.1602
75,2.7994
100,2.1862
125,1.6756
150,1.3903
175,1.4709
200,1.327
225,1.4134
250,1.2695


##**Step 5: Check the plots on tensorboard, as follows**

In [None]:
%load_ext tensorboard
%tensorboard --logdir results/runs

#**Step 6:Use the text generation pipeline to ask questions like “What is a large language model?” Note that I’m formatting the input to match Llama 2’s prompt template.**

In [None]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

prompt = "as a List the season above 241.0 that was handled by brad tanenbaum."
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=100)
result = pipe(prompt)
#print(result[0]['generated_text'])

print(result[0]['generated_text'].split("<|endoftext|>")[0])



as a List the season above 241.0 that was handled by brad tanenbaum., SELECT Season FROM table WHERE Handled by = brad tanenbaum AND Season above 241.0


In [None]:
# Empty VRAM
del model
del pipe
del trainer
import gc
gc.collect()
gc.collect()

#**Step 7: Store New Llama2 Model (Llama-2-7b-chat-finetune)**

How can we store our new Llama-2-7b-chat-finetune model now? We need to merge the weights from LoRA with the base model. Unfortunately, as far as I know, there is no straightforward way to do it: we need to reload the base model in FP16 precision and use the peft library to merge everything.

In [None]:
# Reload model in FP16 and merge it with LoRA weights
# The model that you want to train from the Hugging Face hub
model_name = "NousResearch/Llama-2-7b-chat-hf"
# Fine-tuned model name
new_model = "/content/Llama2_text-to-sql"

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

In [None]:
# Reload tokenizer to save it
# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

In [None]:
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

prompt = "as a What's the lowest bronze with a 6 rank, smaller than 5 gold, and a total of more than 1?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(prompt)
#print(result[0]['generated_text'])
print(result[0]['generated_text'].split("<|endoftext|>")[0])

as a What's the lowest bronze with a 6 rank, smaller than 5 gold, and a total of more than 1? SELECT MIN Bronze FROM table WHERE Rank = 6 AND Gold < 5 AND Total > 1


In [None]:
# Empty VRAM
del model
del pipe
import gc
gc.collect()
gc.collect()

0

#**Step 8: Push Model to Hugging Face Hub**

Our weights are merged and we reloaded the tokenizer. We can now push everything to the Hugging Face Hub to save our model.

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!huggingface-cli login

model.push_to_hub("ekshat/Llama-2-7b-chat-finetune-for-text2sql", check_pr=True)

tokenizer.push_to_hub("ekshat/Llama-2-7b-chat-finetune-for-text2sql",check_pr=True)

In [None]:
!pip install -q -U accelerate==0.23.0 peft==0.5.0 bitsandbytes==0.41.1 transformers==4.31 trl==0.7.2

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/258.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━[0m [32m143.4/258.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m116.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.0/124.0 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m120.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━

In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [None]:
import pandas as pd

filename = "/content/all-data.csv"

df = pd.read_csv(filename,
                 names=["sentiment", "text"],
                 encoding="utf-8", encoding_errors="replace")

df.head()

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [None]:
X_train = list()
X_test = list()
for sentiment in ["positive", "neutral", "negative"]:
    print("11111111111", sentiment, "......", df[df.sentiment==sentiment])
    train, test  = train_test_split(df[df.sentiment==sentiment],
                                    train_size=300,
                                    test_size=300,
                                    random_state=42)
    X_train.append(train)
    X_test.append(test)

X_train = pd.concat(X_train).sample(frac=1, random_state=10)
X_test = pd.concat(X_test)

11111111111 positive ......      sentiment                                               text
3     positive  With the new production plant the company woul...
4     positive  According to the company 's updated strategy f...
5     positive  FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...
6     positive  For the last quarter of 2010 , Componenta 's n...
7     positive  In the third quarter of 2010 , net sales incre...
...        ...                                                ...
4775  positive  The apartment block will be well-located , in ...
4780  positive  The antibody , given at repeated doses of up t...
4786  positive  Danske Bank A-S DANSKE DC jumped 3.7 percent t...
4787  positive  Our superior customer centricity and expertise...
4822  positive  The 2015 target for net sales has been set at ...

[1363 rows x 2 columns]
11111111111 neutral ......      sentiment                                               text
0      neutral  According to Gran , the company has no plans t

In [None]:
X_train, X_test

(     sentiment                                               text
 3683   neutral  Mr Jortikka is president of the base metal div...
 163   positive  Both operating profit and net sales for the 12...
 4017  negative  Finnish automation solutions developer Cencorp...
 1588  positive  Renzo Piano 's building design will be a wonde...
 1799  positive  `` We are proud to contribute to the creation ...
 ...        ...                                                ...
 1374   neutral  The dividend will be paid on April 15 , 2008 t...
 3869   neutral  The new shares entitle their holders to divide...
 2766   neutral  Activities range from the development of natur...
 1798  positive  According to Bosse , the present cooperation i...
 243   positive  Operating profit rose to 22.1 mln eur from 19....
 
 [900 rows x 2 columns],
      sentiment                                               text
 567   positive  The new agreement , which expands a long-estab...
 1752  positive  ( ADP News ) - Fin

In [None]:
eval_idx = [idx for idx in df.index if idx not in list(train.index) + list(test.index)]
X_eval = df[df.index.isin(eval_idx)]
X_eval = (X_eval
          .groupby('sentiment', group_keys=False)
          .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))
X_train = X_train.reset_index(drop=True)
X_train

Unnamed: 0,sentiment,text
0,neutral,Mr Jortikka is president of the base metal div...
1,positive,Both operating profit and net sales for the 12...
2,negative,Finnish automation solutions developer Cencorp...
3,positive,Renzo Piano 's building design will be a wonde...
4,positive,`` We are proud to contribute to the creation ...
...,...,...
895,neutral,"The dividend will be paid on April 15 , 2008 t..."
896,neutral,The new shares entitle their holders to divide...
897,neutral,Activities range from the development of natur...
898,positive,"According to Bosse , the present cooperation i..."


In [None]:
def generate_prompt(data_point):
    return f"""
            Analyze the sentiment of the news headline enclosed in square brackets,
            determine if it is positive, neutral, or negative, and return the answer as
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [{data_point["text"]}] = {data_point["sentiment"]}
            """.strip()
def generate_test_prompt(data_point):
    return f"""
            Analyze the sentiment of the news headline enclosed in square brackets,
            determine if it is positive, neutral, or negative, and return the answer as
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [{data_point["text"]}] = """.strip()



X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1),
                       columns=["text"])
X_eval = pd.DataFrame(X_eval.apply(generate_prompt, axis=1),
                      columns=["text"])

y_true = X_test.sentiment
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)

In [None]:
X_eval['text'][972]


'Analyze the sentiment of the news headline enclosed in square brackets, \n            determine if it is positive, neutral, or negative, and return the answer as \n            the corresponding sentiment label "positive" or "neutral" or "negative".\n\n            [Thanks to the multiplying effect of wagon performance , transport will be much more efficient , \'\' says development manager Juha Malkia from VR Cargo .] = positive'

In [None]:
X_train['text'][1]

'Analyze the sentiment of the news headline enclosed in square brackets, \n            determine if it is positive, neutral, or negative, and return the answer as \n            the corresponding sentiment label "positive" or "neutral" or "negative".\n\n            [Both operating profit and net sales for the 12-month period increased , respectively from EUR21 .5 m and EUR196 .1 m , as compared to 2005 .] = positive'

In [None]:
y_true

567     positive
1752    positive
995     positive
601     positive
568     positive
          ...   
4219    negative
4814    negative
4059    negative
4720    negative
4453    negative
Name: sentiment, Length: 900, dtype: object

In [None]:
X_test['text'][2]

'Analyze the sentiment of the news headline enclosed in square brackets, \n            determine if it is positive, neutral, or negative, and return the answer as \n            the corresponding sentiment label "positive" or "neutral" or "negative".\n\n            [The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .] ='

In [None]:
def evaluate(y_true, y_pred):
    labels = ['positive', 'neutral', 'negative']
    mapping = {'positive': 2, 'neutral': 1, 'none':1, 'negative': 0}
    def map_func(x):
        return mapping.get(x, 1)

    y_true = np.vectorize(map_func)(y_true)
    y_pred = np.vectorize(map_func)(y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Generate accuracy report
    unique_labels = set(y_true)  # Get unique labels

    for label in unique_labels:
        label_indices = [i for i in range(len(y_true))
                         if y_true[i] == label]
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')

    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2])
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [None]:
model_name = "NousResearch/Llama-2-7b-hf"
#model_name="meta-llama/Llama-2-7b-hf"
compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          trust_remote_code=True,
                                         )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [None]:
def predict(test, model, tokenizer):
    y_pred = []
    for i in tqdm(range(len(X_test))):
        prompt = X_test.iloc[i]["text"]
        pipe = pipeline(task="text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        max_new_tokens = 1,
                        temperature = 0.0,
                       )
        result = pipe(prompt)
        answer = result[0]['generated_text'].split("=")[-1]
        if "positive" in answer:
            y_pred.append("positive")
        elif "negative" in answer:
            y_pred.append("negative")
        elif "neutral" in answer:
            y_pred.append("neutral")
        else:
            y_pred.append("none")
    return y_pred

In [None]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

training_arguments = TrainingArguments(
    output_dir="logs",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8, # 4
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    evaluation_strategy="epoch"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    max_seq_length=1024,
)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [None]:
# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained("trained-model")

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
0,0.8208,0.733769


In [None]:
y_pred = predict(test, model, tokenizer)
evaluate(y_true, y_pred)

  0%|          | 0/900 [00:00<?, ?it/s]Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
100%|██████████| 900/900 [03:20<00:00,  4.49it/s]

Accuracy: 0.782
Accuracy for label 0: 0.983
Accuracy for label 1: 0.507
Accuracy for label 2: 0.857

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.98      0.89       300
           1       0.83      0.51      0.63       300
           2       0.72      0.86      0.78       300

    accuracy                           0.78       900
   macro avg       0.79      0.78      0.77       900
weighted avg       0.79      0.78      0.77       900


Confusion Matrix:
[[295   3   2]
 [ 51 152  97]
 [ 15  28 257]]





In [None]:
evaluation = pd.DataFrame({'text': X_test["text"],
                           'y_true':y_true,
                           'y_pred': y_pred},
                         )
evaluation.to_csv("test_predictions.csv", index=False)

In [None]:
X_test['text'][2]

'Analyze the sentiment of the news headline enclosed in square brackets, \n            determine if it is positive, neutral, or negative, and return the answer as \n            the corresponding sentiment label "positive" or "neutral" or "negative".\n\n            [The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .] ='

In [None]:
X_train['text'][1]

'Analyze the sentiment of the news headline enclosed in square brackets, \n            determine if it is positive, neutral, or negative, and return the answer as \n            the corresponding sentiment label "positive" or "neutral" or "negative".\n\n            [Both operating profit and net sales for the 12-month period increased , respectively from EUR21 .5 m and EUR196 .1 m , as compared to 2005 .] = positive'

In [None]:
test1 ='Analyze the sentiment of the news headline enclosed in square brackets, \n            determine if it is positive, neutral, or negative, and return the answer as \n            the corresponding sentiment label "positive" or "neutral" or "negative".\n\n            [Both operating profit and net sales for the 12-month period increased , respectively from EUR21 .5 m and EUR196 .1 m , as compared to 2005 .] = '
y_pred = []
prompt = test1
pipe = pipeline(task="text-generation",
                model=model,
                tokenizer=tokenizer,
                max_new_tokens = 1,
                temperature = 0.0,
                )
result = pipe(prompt)
#answer = result[0]['generated_text'].split("=")[-1]
answer = result[0]['generated_text']
answer

Input length of input_ids is 107, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


'Analyze the sentiment of the news headline enclosed in square brackets, \n            determine if it is positive, neutral, or negative, and return the answer as \n            the corresponding sentiment label "positive" or "neutral" or "negative".\n\n            [Both operating profit and net sales for the 12-month period increased , respectively from EUR21 .5 m and EUR196 .1 m , as compared to 2005 .] =  positive'

In [None]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Ruta del modelo guardado en el dataset de Kaggle
from peft import LoraConfig, PeftModel

device_map = {"": 0}
PEFT_MODEL = "/content/trained-model"
#model_name = "NousResearch/Llama-2-7b-hf"

# Cargar la configuración del modelo
config = PeftConfig.from_pretrained(PEFT_MODEL)

# Cargar el modelo
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    low_cpu_mem_usage=True,
    return_dict=True,
    #quantization_config=bnb_config,
    device_map="auto",
    #trust_remote_code=True,
    torch_dtype=torch.float16,
)

# Cargar el tokenizador
tokenizer=AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

# Cargar el modelo PEFT
load_model = PeftModel.from_pretrained(model, PEFT_MODEL)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
test1 ='Analyze the sentiment of the news headline enclosed in square brackets, \n            determine if it is positive, neutral, or negative, and return the answer as \n            the corresponding sentiment label "positive" or "neutral" or "negative".\n\n            [Both operating profit and net sales for the 12-month period increased , respectively from EUR21 .5 m and EUR196 .1 m , as compared to 2005 .] = '
y_pred = []
prompt = test1
pipe = pipeline(task="text-generation",
                model=load_model,
                tokenizer=tokenizer,
                max_new_tokens = 1,
                temperature = 0.0,
                )
result = pipe(prompt)
#answer = result[0]['generated_text'].split("=")[-1]
answer = result[0]['generated_text']
answer

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausal

'Analyze the sentiment of the news headline enclosed in square brackets, \n            determine if it is positive, neutral, or negative, and return the answer as \n            the corresponding sentiment label "positive" or "neutral" or "negative".\n\n            [Both operating profit and net sales for the 12-month period increased , respectively from EUR21 .5 m and EUR196 .1 m , as compared to 2005 .] =  positive'

In [None]:
#!rm -rf /content/logs_keywords

In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [None]:
df =pd.read_csv("/content/dataset_all.csv")
df =df.iloc[0:1500,:]
df.head()
print(len(df))

1500


In [None]:
from sklearn.model_selection import train_test_split
# Split the data into training, testing, and validation sets
# First, split into train and temp sets (80% train and 20% temp)
X_train, X_temp = train_test_split(df, test_size=0.2, random_state=42)

# Then, split the temp set into test and validation sets (50% test and 50% validation)
X_test, X_eval = train_test_split(X_temp, test_size=0.5, random_state=42)

In [None]:
X_train

Unnamed: 0,keywords,description
382,project manager,"worked with Cyber Security, solutions, storage..."
538,frontend developer,revamped UI and app interface for a web applic...
1493,frontend developer,"built Single Page Applications (SPA), Responsi..."
1112,python developer,designed and developed a horizontally scalable...
324,java developer,"worked extensively on Core Java, low latency s..."
...,...,...
1130,python developer,used embedded python to interface code like co...
1294,database administrator,worked on Oracle 12c New Features such as reco...
860,systems administrator,designed and implemented call center equipment...
1459,frontend developer,"worked with the Management, Development and Qu..."


In [None]:
def generate_prompt(data_point):
    return f"""
            Analyze the keywords enclosed in square brackets,
            determine description if it is belong to keywords, and return the answer as
            the corresponding description.

            [{data_point["keywords"]}] = {data_point["description"]}
            """.strip()
def generate_test_prompt(data_point):
    return f"""
            Analyze the keywords enclosed in square brackets,
            determine description if it is belong to keywords, and return the answer as
            the corresponding description.

            [{data_point["keywords"]}] = """.strip()



X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1),
                       columns=["text"])
X_eval = pd.DataFrame(X_eval.apply(generate_prompt, axis=1),
                      columns=["text"])

y_true = X_test.description
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)

In [None]:
X_train['text'][1]

'Analyze the keywords enclosed in square brackets,\n            determine description if it is belong to keywords, and return the answer as\n            the corresponding description.\n\n            [database administrator] = designed and built an HA DBaaS platform using MySQL and Galera.'

In [None]:
X_test['text'][1161]

'Analyze the keywords enclosed in square brackets,\n            determine description if it is belong to keywords, and return the answer as\n            the corresponding description.\n\n            [java developer] ='

In [None]:
y_true[1161]

'implemented ORM with HIBERNATE to make the Persistence class objects interact with numerous SQL Server tables spanned across various schemas as per MODOC standards.'

In [None]:
X_eval['text'][590]

'Analyze the keywords enclosed in square brackets,\n            determine description if it is belong to keywords, and return the answer as\n            the corresponding description.\n\n            [backend developer] = implemented React.js with Redux pattern for component-driven web development.'

In [None]:
#!rm -rf /content/logs_keywords

In [None]:
model_name = "NousResearch/Llama-2-7b-hf"
#model_name="meta-llama/Llama-2-7b-hf"
compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          trust_remote_code=True,
                                         )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

training_arguments = TrainingArguments(
    output_dir="results_keywords",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8, # 4
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    evaluation_strategy="epoch"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    max_seq_length=1024,
)

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [None]:
# Train model
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.8124,1.068716


TrainOutput(global_step=150, training_loss=1.2784169514973958, metrics={'train_runtime': 344.7747, 'train_samples_per_second': 3.481, 'train_steps_per_second': 0.435, 'total_flos': 1584878096547840.0, 'train_loss': 1.2784169514973958, 'epoch': 1.0})

In [None]:
# Save trained model
trainer.model.save_pretrained("Llama2_keywords")

In [None]:
#'Analyze the keywords enclosed in square brackets, determine description if it is belong to keywords, and return the answer as the corresponding description. [database administrator] = designed and built an HA DBaaS platform using MySQL and Galera.'
test1 ='Analyze the keywords enclosed in square brackets,\n            determine description if it is belong to keywords, and return the answer as\n            the corresponding description.\n\n            [database administrator] ='
test2 ='Analyze the keywords enclosed in square brackets,\n            determine description if it is belong to keywords, and return the answer as\n            the corresponding description.\n\n            [backend developer] ='
test3 ='Analyze the keywords enclosed in square brackets,\n            determine description if it is belong to keywords, and return the answer as\n            the corresponding description.\n\n            [frontend developer] ='
test4 ='Analyze the keywords enclosed in square brackets,\n            determine description if it is belong to keywords, and return the answer as\n            the corresponding description.\n\n            [python developer] ='

prompt = test4
pipe = pipeline(task="text-generation",
                model=model,
                tokenizer=tokenizer,
                #max_length =20,
                max_new_tokens =20,
                temperature = 0.0,
                )
result = pipe(prompt)
#answer1 = " ".join(result[0]['generated_text'].split("=")[:-1]).strip()
#print(answer1)
answer2 = result[0]['generated_text']
print(answer2)

Analyze the keywords enclosed in square brackets,
            determine description if it is belong to keywords, and return the answer as
            the corresponding description.

            [python developer] = used Python to create a web crawler to scrape data from the web.

            [


In [None]:
answerx = " ".join(result[0]['generated_text'].split(".")[:-1])
answerx

'Analyze the keywords enclosed in square brackets,\n            determine description if it is belong to keywords, and return the answer as\n            the corresponding description \n\n            [python developer] = used Python to create a web crawler to scrape data from the web'

In [None]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Ruta del modelo guardado en el dataset de Kaggle
from peft import LoraConfig, PeftModel

device_map = {"": 0}
PEFT_MODEL = "/content/Llama2_keywords"
#model_name = "NousResearch/Llama-2-7b-hf"

# Cargar la configuración del modelo
config = PeftConfig.from_pretrained(PEFT_MODEL)

# Cargar el modelo
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    low_cpu_mem_usage=True,
    return_dict=True,
    #quantization_config=bnb_config,
    device_map="auto",
    #trust_remote_code=True,
    torch_dtype=torch.float16,
)

# Cargar el tokenizador
tokenizer=AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

# Cargar el modelo PEFT
load_model = PeftModel.from_pretrained(model, PEFT_MODEL)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
test1 ='Analyze the keywords enclosed in square brackets,\n            determine description if it is belong to keywords, and return the answer as\n            the corresponding description.\n\n            [database administrator] ='
test2 ='Analyze the keywords enclosed in square brackets,\n            determine description if it is belong to keywords, and return the answer as\n            the corresponding description.\n\n            [backend developer] ='
test3 ='Analyze the keywords enclosed in square brackets,\n            determine description if it is belong to keywords, and return the answer as\n            the corresponding description.\n\n            [frontend developer] ='
test4 ='Analyze the keywords enclosed in square brackets,\n            determine description if it is belong to keywords, and return the answer as\n            the corresponding description.\n\n            [python developer] ='
test5='Analyze the keywords enclosed in square brackets,\n            determine description if it is belong to keywords, and return the answer as\n            the corresponding description.\n\n            [java developer] ='
test6='Analyze the keywords enclosed in square brackets,\n            determine description if it is belong to keywords, and return the answer as\n            the corresponding description.\n\n            [c# developer] ='
prompt_test = test1
pipe_test = pipeline(task="text-generation",
                model=load_model,
                tokenizer=tokenizer,
                max_new_tokens = 25,
                #temperature = 0.1,
                #max_length =50,
                )
result_test = pipe_test(prompt_test)
#answer = result[0]['generated_text'].split("=")[-1]
answer_test = result_test[0]['generated_text']
answer_test

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausal

'Analyze the keywords enclosed in square brackets,\n            determine description if it is belong to keywords, and return the answer as\n            the corresponding description.\n\n            [database administrator] = created and maintained the database backup and recovery plan.\n\n            [database administrator] = created and maintained the database backup and'

In [None]:
answery = " ".join(result_test[0]['generated_text'].split(".")[:-1])
answery

'Analyze the keywords enclosed in square brackets,\n            determine description if it is belong to keywords, and return the answer as\n            the corresponding description \n\n            [database administrator] = created and maintained the database backup and recovery plan'

In [None]:
l

In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [None]:
from datasets import load_dataset
#datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')
datasets = load_dataset("wikisql")
datasets

DatasetDict({
    test: Dataset({
        features: ['phase', 'question', 'table', 'sql'],
        num_rows: 15878
    })
    validation: Dataset({
        features: ['phase', 'question', 'table', 'sql'],
        num_rows: 8421
    })
    train: Dataset({
        features: ['phase', 'question', 'table', 'sql'],
        num_rows: 56355
    })
})

In [None]:
import pandas as pd
from datasets import load_dataset

# Load the 'wikisql' dataset
dataset = load_dataset("wikisql")

# Combine the dataset into a Pandas DataFrame
df = pd.DataFrame({
    'question': dataset['train']['question'],
    'sql': dataset['train']['sql'],
})

# Print the first few rows of the DataFrame
print(df.head())


                                            question  \
0    Tell me what the notes are for South Australia    
1  What is the current series where the new serie...   
2            What is the format for South Australia?   
3  Name the background colour for the Australian ...   
4      how many times is the fuel propulsion is cng?   

                                                 sql  
0  {'human_readable': 'SELECT Notes FROM table WH...  
1  {'human_readable': 'SELECT Current series FROM...  
2  {'human_readable': 'SELECT Format FROM table W...  
3  {'human_readable': 'SELECT Text/background col...  
4  {'human_readable': 'SELECT COUNT Fleet Series ...  


In [None]:
df['sql_cleaned']=df['sql'].apply(lambda x:x['human_readable'])
df.head()

Unnamed: 0,question,sql,sql_cleaned
0,Tell me what the notes are for South Australia,{'human_readable': 'SELECT Notes FROM table WH...,SELECT Notes FROM table WHERE Current slogan =...
1,What is the current series where the new serie...,{'human_readable': 'SELECT Current series FROM...,SELECT Current series FROM table WHERE Notes =...
2,What is the format for South Australia?,{'human_readable': 'SELECT Format FROM table W...,SELECT Format FROM table WHERE State/territory...
3,Name the background colour for the Australian ...,{'human_readable': 'SELECT Text/background col...,SELECT Text/background colour FROM table WHERE...
4,how many times is the fuel propulsion is cng?,{'human_readable': 'SELECT COUNT Fleet Series ...,SELECT COUNT Fleet Series (Quantity) FROM tabl...


In [None]:
df['sql_cleaned'][1]

'SELECT Current series FROM table WHERE Notes = New series began in June 2011'

In [None]:
#df =pd.read_csv("/content/dataset_all.csv")
df=df[["question","sql_cleaned"]]
df =df.iloc[0:1500,:]
print(len(df))
df.head()


1500


Unnamed: 0,question,sql_cleaned
0,Tell me what the notes are for South Australia,SELECT Notes FROM table WHERE Current slogan =...
1,What is the current series where the new serie...,SELECT Current series FROM table WHERE Notes =...
2,What is the format for South Australia?,SELECT Format FROM table WHERE State/territory...
3,Name the background colour for the Australian ...,SELECT Text/background colour FROM table WHERE...
4,how many times is the fuel propulsion is cng?,SELECT COUNT Fleet Series (Quantity) FROM tabl...


In [None]:
from sklearn.model_selection import train_test_split
# Split the data into training, testing, and validation sets
# First, split into train and temp sets (80% train and 20% temp)
X_train, X_temp = train_test_split(df, test_size=0.2, random_state=42)

# Then, split the temp set into test and validation sets (50% test and 50% validation)
X_test, X_eval = train_test_split(X_temp, test_size=0.5, random_state=42)

In [None]:
X_train

Unnamed: 0,question,sql_cleaned
382,How many records are there at the War Memorial...,SELECT COUNT Record FROM table WHERE Stadium =...
538,Which planet has an orbital period of 11.86 ye...,SELECT Planet FROM table WHERE Orbital Period ...
1493,What is the score of the game with the streak l5,SELECT Score FROM table WHERE Streak = L5
1112,Where was the GTE Suncoast Classic tournament ...,SELECT Location FROM table WHERE Tournament = ...
324,what's the minimum attendance with score 10.1...,SELECT MIN Attendance FROM table WHERE Score =...
...,...,...
1130,"Who wrote ""Stop Being all Funky""?","SELECT Written by FROM table WHERE Title = ""St..."
1294,"Who was the director for the title, ""funhouse""?","SELECT Director(s) FROM table WHERE Title = ""F..."
860,who is the grand finalist where scores is 11....,SELECT Grand Finalist FROM table WHERE Scores ...
1459,When did Chris Bosh (14) have the high rebounds?,SELECT Date FROM table WHERE High rebounds = C...


In [None]:
def generate_prompt(data_point):
    return f"""
            Analyze the sql question enclosed in square brackets,
            determine sql sytax if it is belong to sql, and return the answer as
            the corresponding sql sytax.

            [{data_point["question"]}] = {data_point["sql_cleaned"]}
            """.strip()
def generate_test_prompt(data_point):
    return f"""
            Analyze the sql question enclosed in square brackets,
            determine sql sytax if it is belong to sql, and return the answer as
            the corresponding sql sytax.

            [{data_point["question"]}] = """.strip()



X_train = pd.DataFrame(X_train.apply(generate_prompt, axis=1),
                       columns=["text"])
X_eval = pd.DataFrame(X_eval.apply(generate_prompt, axis=1),
                      columns=["text"])

y_true = X_test.sql_cleaned
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)

In [None]:
X_train['text'][390]

"Analyze the sql question enclosed in square brackets,\n            determine sql sytax if it is belong to sql, and return the answer as\n            the corresponding sql sytax.\n\n            [What are the chances that player 2 wins if player 1's choice is BB R?] = SELECT Probability 2nd player wins FROM table WHERE 1st players choice = BB R"

In [None]:
X_eval['text'][464]

'Analyze the sql question enclosed in square brackets,\n            determine sql sytax if it is belong to sql, and return the answer as\n            the corresponding sql sytax.\n\n            [Who directed El Nido?] = SELECT Director FROM table WHERE Original title = El nido'

In [None]:
X_test['text'][899]

'Analyze the sql question enclosed in square brackets,\n            determine sql sytax if it is belong to sql, and return the answer as\n            the corresponding sql sytax.\n\n            [The Dijon-prenois had how many fastest laps?] ='

In [None]:
y_true[899]

'SELECT COUNT Fastest Lap FROM table WHERE Location = Dijon-Prenois'

In [None]:
model_name = "NousResearch/Llama-2-7b-hf"
#model_name="meta-llama/Llama-2-7b-hf"
compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          trust_remote_code=True,
                                         )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

training_arguments = TrainingArguments(
    output_dir="results_sql",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8, # 4
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    evaluation_strategy="epoch"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    max_seq_length=1024,
)

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [None]:
# Train model
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.6367,0.676974


TrainOutput(global_step=150, training_loss=1.0056418736775716, metrics={'train_runtime': 360.6695, 'train_samples_per_second': 3.327, 'train_steps_per_second': 0.416, 'total_flos': 1917293332684800.0, 'train_loss': 1.0056418736775716, 'epoch': 1.0})

In [None]:
# Save trained model
trainer.model.save_pretrained("Llama2_sql")

In [None]:
#'Analyze the keywords enclosed in square brackets, determine description if it is belong to keywords, and return the answer as the corresponding description. [database administrator] = designed and built an HA DBaaS platform using MySQL and Galera.'
test1 ='Analyze the sql question enclosed in square brackets,\n            determine sql sytax if it is belong to sql, and return the answer as\n            the corresponding sql sytax.\n\n            [How many records are there at the War Memorial Stadium?] ='
test2 ='Analyze the sql question enclosed in square brackets,\n            determine sql sytax if it is belong to sql, and return the answer as\n            the corresponding sql sytax.\n\n            [The Dijon-prenois had how many fastest laps?] ='
test3 ='Analyze the sql question enclosed in square brackets,\n            determine sql sytax if it is belong to sql, and return the answer as\n            the corresponding sql sytax.\n\n            [In which stadium is the week 5 game played?] ='
test4 ="Analyze the sql question enclosed in square brackets,\n            determine sql sytax if it is belong to sql, and return the answer as\n            the corresponding sql sytax.\n\n            [What are the chances that player 2 wins if player 1's choice is BB R?] ="

prompt = test3
pipe = pipeline(task="text-generation",
                model=model,
                tokenizer=tokenizer,
                #max_length =20,
                max_new_tokens =25,
                temperature = 0.0,
                )
result = pipe(prompt)
#answer1 = " ".join(result[0]['generated_text'].split("=")[:-1]).strip()
#print(answer1)
answer2 = result[0]['generated_text']
print(answer2)

Analyze the sql question enclosed in square brackets,
            determine sql sytax if it is belong to sql, and return the answer as
            the corresponding sql sytax.

            [In which stadium is the week 5 game played?] = SELECT Stadium FROM table WHERE Week = 5

            [What is the date of the week 1 game?]


In [None]:
answerx = " ".join(result[0]['generated_text'].split("[")[:-1])
answerx

'Analyze the sql question enclosed in square brackets,\n            determine sql sytax if it is belong to sql, and return the answer as\n            the corresponding sql sytax.\n\n             In which stadium is the week 5 game played?] = SELECT Stadium FROM table WHERE Week = 5\n\n            '

In [None]:
#testing and loading model

In [None]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Ruta del modelo guardado en el dataset de Kaggle
from peft import LoraConfig, PeftModel

device_map = {"": 0}
PEFT_MODEL = "/content/Llama2_sql"
#model_name = "NousResearch/Llama-2-7b-hf"

# Cargar la configuración del modelo
config = PeftConfig.from_pretrained(PEFT_MODEL)

# Cargar el modelo
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    low_cpu_mem_usage=True,
    return_dict=True,
    #quantization_config=bnb_config,
    device_map="auto",
    #trust_remote_code=True,
    torch_dtype=torch.float16,
)

# Cargar el tokenizador
tokenizer=AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

# Cargar el modelo PEFT
load_model = PeftModel.from_pretrained(model, PEFT_MODEL)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
test1 ='Analyze the sql question enclosed in square brackets,\n            determine sql sytax if it is belong to sql, and return the answer as\n            the corresponding sql sytax.\n\n            [How many records are there at the War Memorial Stadium?] ='
test2 ='Analyze the sql question enclosed in square brackets,\n            determine sql sytax if it is belong to sql, and return the answer as\n            the corresponding sql sytax.\n\n            [The Dijon-prenois had how many fastest laps?] ='
test3 ='Analyze the sql question enclosed in square brackets,\n            determine sql sytax if it is belong to sql, and return the answer as\n            the corresponding sql sytax.\n\n            [In which stadium is the week 5 game played?] ='
test4 ="Analyze the sql question enclosed in square brackets,\n            determine sql sytax if it is belong to sql, and return the answer as\n            the corresponding sql sytax.\n\n            [What are the chances that player 2 wins if player 1's choice is BB R?] ="


prompt_test = test1
pipe_test = pipeline(task="text-generation",
                model=load_model,
                tokenizer=tokenizer,
                #max_length =20,
                max_new_tokens =25,
                temperature = 0.0,
                )
result_test = pipe_test(prompt_test)
#answer = result[0]['generated_text'].split("=")[-1]
answer_test = result_test[0]['generated_text']
answer_test

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausal

'Analyze the sql question enclosed in square brackets,\n            determine sql sytax if it is belong to sql, and return the answer as\n            the corresponding sql sytax.\n\n            [How many records are there at the War Memorial Stadium?] = SELECT COUNT FROM table WHERE Stadium = War Memorial Stadium\n\n            [What is the capacity of the War Memorial Stadium?]'

In [None]:
answery = " ".join(result_test[0]['generated_text'].split("[")[:-1])
answery

'Analyze the sql question enclosed in square brackets,\n            determine sql sytax if it is belong to sql, and return the answer as\n            the corresponding sql sytax.\n\n             How many records are there at the War Memorial Stadium?] = SELECT COUNT FROM table WHERE Stadium = War Memorial Stadium\n\n            '