# Fine-tuning + context-learning for a CodeLlama-2 Python model for Ops Data transform generations

## Installing and loading the libraries

In [1]:
!pip install "transformers" "datasets" "peft" "accelerate" "bitsandbytes" "trl" "safetensors>=0.3.1" "accelerate" --upgrade

Collecting transformers
  Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.6.2-py3-none-any.whl (174 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.7/174.7 kB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.41.2.post2-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m19.8

In [2]:
!pip install ipywidgets==7.7.1
!pip install huggingface_hub
!pip install python-dotenv
!pip install sentencepiece

Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets==7.7.1)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi
Successfully installed jedi-0.19.1
Collecting python-dotenv
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.0
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [3]:
from datasets import load_dataset
from random import randrange

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, CodeLlamaTokenizer, pipeline
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM

from trl import SFTTrainer



## Setting Global Parameters

In [4]:
# The model that you want to train from the Hugging Face hub
model_id = "codellama/CodeLlama-7b-Python-hf"
# The instruction dataset to use
dataset_name = "newbiettn/ops_transforms"
# Dataset split
dataset_split= "train"
# Fine-tuned model name
new_model = "CodeLlama-7b-Python-hf-fine-tuned"
# Huggingface repository
hf_model_repo="newbiettn/"+new_model
# Load the entire model on the GPU 0
device_map = {"": 0}

################################################################################
# bitsandbytes parameters
################################################################################
# Activate 4-bit precision base model loading
use_4bit = True
# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"
# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"
# Activate nested quantization for 4-bit base models (double quantization)
use_double_nested_quant = False

################################################################################
# QLoRA parameters
################################################################################
# LoRA attention dimension
lora_r = 64
# Alpha parameter for LoRA scaling
lora_alpha = 16
# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# TrainingArguments parameters
################################################################################
# Output directory where the model predictions and checkpoints will be stored
output_dir = new_model
# Number of training epochs
num_train_epochs = 200
# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = True
# Batch size per GPU for training
per_device_train_batch_size = 64
# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1 # 2
# Enable gradient checkpointing
gradient_checkpointing = True
# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3
# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4 #1e-5
# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001
# Optimizer to use
optim = "paged_adamw_32bit"
# Learning rate schedule
lr_scheduler_type = "cosine" #"constant"
# Number of training steps (overrides num_train_epochs)
max_steps = -1
# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03
# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = False
# Save checkpoint every X updates steps
save_steps = 0
# Log every X updates steps
logging_steps = 100
# Disable tqdm
disable_tqdm= True

################################################################################
# SFTTrainer parameters
################################################################################
# Maximum sequence length to use
max_seq_length = 2048 #None
# Pack multiple short examples in the same input sequence to increase efficiency
packing = True #False


## Connect to Huggingface Hub

Log in to Hugging Face Hub using token

In [5]:
from huggingface_hub import login

# Login to the Hugging Face Hub
login(token="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")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## Load the dataset with the instruction set

In [6]:
# Load dataset from the hub
dataset = load_dataset(dataset_name, split=dataset_split)
# Show dataset size
print(f"dataset size: {len(dataset)}")
# Show an example
print(dataset[randrange(len(dataset))])

Downloading readme:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

dataset size: 57
{'instruction': 'In Lexer OpsData Pyspark transform codes, Create get_validated_year_of_birth() function. This function takes a year column that is the length of 4 and accepts year later than 1900 until the current year. Cast the result as integer. It will return Null for invalid year', 'input': 'yob', 'output': 'def get_validated_year_of_birth(yob:Column) -> Column:\n  """This function is used to validate and format year of birth column.\n  """\n  yob = F.when(yob.rlike("\\d{4}"), yob).otherwise(F.lit(None))\n  yob = F.when((yob >= 1900) & (yob <= F.year(F.current_date())), yob).otherwise(F.lit(None))\n  yob = yob.cast("int")\n  return yob'}


In [7]:
# Check the dataset structure
dataset

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 57
})

In [8]:
# Show a random example
print(dataset[randrange(len(dataset))])

{'instruction': 'In Lexer OpsData Pyspark transform codes, Persist customer_source using filter_most_recent() function based on input column', 'input': 'customer_id', 'output': 'customer_persist = filter_most_recent(source= customer_source, \n                                  filter_cols= "customer_id",\n                                  date_pattern=r"\\/imports\\/(\\d{4}\\/\\d{2}\\/\\d{2})")'}


To fine-tune our model, we need to convert our structured examples into a collection of tasks described via instructions. We define a formatting_function that takes a sample and returns a string with our instruction format.

In [9]:
def format_instruction(sample):
	return f"""### Instruction:
Use the Task below and the Input given to write the Response, which is a programming code that can solve the following Task:

### Task:
{sample['instruction']}

### Input:
{sample['input']}

### Response:
{sample['output']}
"""

In [10]:
# Set the instruction format for HuggingFaceH4/CodeAlpaca_20K
#def format_instruction(sample):
	#return f"""### Instruction:
#You are a coding assistant that will write a Solution to resolve the following Task:"

### Task:
#{sample['prompt']}

### Solution:
#{sample['completion']}
#"""

In [11]:
# Show a formatted instruction
print(format_instruction(dataset[randrange(len(dataset))]))


### Instruction:
Use the Task below and the Input given to write the Response, which is a programming code that can solve the following Task:

### Task:
In Lexer OpsData Pyspark transform codes, Cast input column as string. This replace the current input column

### Input:
age

### Response:
customer_persist = customer_persist.withColumn("age", F.col("age").cast("string"))



## Instruction fine-tune a Llama 2 model using trl and the SFTTrainer

We will use the recently introduced method in the paper "QLoRA: Quantization-aware Low-Rank Adapter Tuning for Language Generation" by Tim Dettmers et al. QLoRA is a new technique to reduce the memory footprint of large language models during finetuning, without sacrificing performance.

Quantize the pre-trained model to 4 bits and freeze it.
Attach small, trainable adapter layers. (LoRA)
Finetune only the adapter layers while using the frozen quantized model for context.

In [12]:
# Get the type
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_use_double_quant=use_double_nested_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype
)

In [13]:
# Load the pretrained model
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, use_cache = False, device_map=device_map)
model.config.pretraining_tp = 1

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

(…)ma-7b-Python-hf/resolve/main/config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

(…)esolve/main/model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

(…)n-hf/resolve/main/generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

(…)on-hf/resolve/main/tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

(…)7b-Python-hf/resolve/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

(…)-hf/resolve/main/special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

The SFTTrainer supports a native integration with peft, which makes it super easy to efficiently instruction tune LLMs. We only need to create our LoRAConfig and provide it to the trainer.

In [14]:
# LoRA config based on QLoRA paper
peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        r=lora_r,
        bias="none",
        task_type="CAUSAL_LM",
)
# Not necessary when using SFTTrainer
# prepare model for training
# model = prepare_model_for_kbit_training(model)
# model = get_peft_model(model, peft_config)

In [15]:
!rm -rf sample_data

Before we can start our training we need to define the hyperparameters (TrainingArguments) we want to use

In [16]:
# Define the training arguments
args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size, # 6 if use_flash_attention else 4,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    #save_steps=save_steps,
    logging_steps=logging_steps,
    save_strategy="epoch",
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    #max_steps=max_steps,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    disable_tqdm=disable_tqdm,
    report_to="tensorboard",
    seed=42
)

We now have every building block we need to create our SFTTrainer to start then training our model.

In [17]:
# Create the trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=packing,
    formatting_func=format_instruction,
    args=args,
)

Start training our model by calling the train() method on our Trainer instance.

In [18]:
# train
trainer.train() # there will not be a progress bar since tqdm is disabled

# save model in local
trainer.save_model()

You're using a CodeLlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.3705, 'learning_rate': 0.00010485622221144484, 'epoch': 100.0}




{'loss': 0.0069, 'learning_rate': 0.0, 'epoch': 200.0}
{'train_runtime': 1165.6244, 'train_samples_per_second': 9.78, 'train_steps_per_second': 0.172, 'train_loss': 0.18873168915510177, 'epoch': 200.0}


## Merge the model and the adapters and save it

When running in a T4 instance we have to clean the memory

In [73]:
# Empty VRAM
del model
del trainer
import gc
gc.collect()

0

In [71]:
torch.cuda.empty_cache() # PyTorch thing

In [21]:
!rm -rf merged_model

In [22]:
#gc.collect()

Reload the trained and saved model and merge it then we can save the whole model

In [23]:
from peft import AutoPeftModelForCausalLM

new_model = AutoPeftModelForCausalLM.from_pretrained(
    args.output_dir,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)

# Merge LoRA and base model
merged_model = new_model.merge_and_unload()

# Save the merged model
merged_model.save_pretrained("merged_model", safe_serialization=True)
tokenizer.save_pretrained("merged_model")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

('merged_model/tokenizer_config.json',
 'merged_model/special_tokens_map.json',
 'merged_model/tokenizer.model',
 'merged_model/added_tokens.json',
 'merged_model/tokenizer.json')

In [24]:
# push merged model to the hub
merged_model.push_to_hub(hf_model_repo)
tokenizer.push_to_hub(hf_model_repo)

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/newbiettn/CodeLlama-7b-Python-hf-fine-tuned/commit/6bb5490b0cfc470eb96b4c5fa74aab8e554898c9', commit_message='Upload tokenizer', commit_description='', oid='6bb5490b0cfc470eb96b4c5fa74aab8e554898c9', pr_url=None, pr_revision=None, pr_num=None)

## Test the merged model

It is time to check our model performance

In [25]:
sample = dataset[randrange(len(dataset))]

prompt = f"""### Instruction:
Use the Task below and the Input given to write the Response, which is a programming code that can solve the following Task:

### Task:
{sample['instruction']}

### Input:
{sample['input']}

### Response:
"""

input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
# with torch.inference_mode():
outputs = merged_model.generate(input_ids=input_ids, max_new_tokens=100, do_sample=True, top_p=0.9,temperature=0.5)

print(f"Prompt:\n{prompt}\n")
print(f"\nGenerated instruction:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}")
print(f"\nGround truth:\n{sample['output']}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Prompt:
### Instruction:
Use the Task below and the Input given to write the Response, which is a programming code that can solve the following Task:

### Task:
In Lexer OpsData Pyspark transform codes, Create map_gender() function to change M to Male and F to Female. This function takes gender column and map its value

### Input:
gender

### Response:



Generated instruction:
# Define udf
map_gender =udf(lambda x: "Male" if x=="M" else "Female" if x=="F" else x, StringType())


Ground truth:
def map_gender(gender):
    return when(gender == 'M', 'Male').when(gender == 'F', 'Female').otherwise(gender)


In [26]:
# Run an new inference
prompt = f"""### Instruction:
Use the Input below to create an instruction, which could have been used to generate the input using an LLM.

### Input:
{sample['output']}

### Response:
"""

print(f"Prompt:\n{sample['output']}\n")
print(f"Generated instruction:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}")
print(f"Ground truth:\n{sample['instruction']}")

Prompt:
def map_gender(gender):
    return when(gender == 'M', 'Male').when(gender == 'F', 'Female').otherwise(gender)

Generated instruction:
s function takes gender column and map its value

### Input:
gender

### Response:
# Define udf
map_gender =udf(lambda x: "Male" if x=="M" else "Female" if x=="F" else x, StringType())

Ground truth:
In Lexer OpsData Pyspark transform codes, Create map_gender() function to change M to Male and F to Female. This function takes gender column and map its value


## Load the model from the HF Hub and test it

---



Finally we download the created model from the hub and test it to make sure it works fine!

In [27]:
# If not defined
hf_model_repo='newbiettn/CodeLlama-7b-Python-hf-fine-tuned'

In [28]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Get the tokenizer
tokenizer = AutoTokenizer.from_pretrained(hf_model_repo)
# Load the model
model = AutoModelForCausalLM.from_pretrained(hf_model_repo, load_in_4bit=True, torch_dtype=torch.float16,
                                             device_map=device_map)
# Create an instruction
instruction="Load all SFTP JSON file that has a file pattern from the S3 bucket"
input="*_user.json"

prompt = f"""### Instruction:
Use the Task below and the Input given to write the Response, which is a programming code that can solve the Task.

### Task:
{instruction}

### Input:
{input}

### Response:
"""
# Tokenize the input
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
# Run the model to infere an output
outputs = model.generate(input_ids=input_ids, max_new_tokens=100, do_sample=True, top_p=0.9,temperature=0.5)

# Print the result
print(f"Prompt:\n{prompt}\n")
print(f"Generated instruction:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]}")



(…)tuned/resolve/main/tokenizer_config.json:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

(…)f-fine-tuned/resolve/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

(…)ine-tuned/resolve/main/added_tokens.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

(…)ned/resolve/main/special_tokens_map.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


(…)n-hf-fine-tuned/resolve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

(…)esolve/main/model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]



model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

OSError: ignored

## Context-learning on Ops-Data codes

---

In [34]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [35]:
!pip install "langchain" "faiss-cpu" "huggingface-hub" "sentence-transformers"

Collecting langchain
  Downloading langchain-0.0.336-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.2-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langsmith<0.1.0,>=0.0.63 (from langchain)
  Down

In [38]:
import os
from langchain.document_loaders import TextLoader
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import LlamaCpp, HuggingFacePipeline
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.callbacks.base import BaseCallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [40]:
from google.colab import drive
drive.mount('ngoctran', force_remount=True)

Mounted at ngoctran


In [42]:
root_dir = '/content/ngoctran/MyDrive/hackoween/codes/'
docs= []
for dirpath,dirnames, filenames in os.walk(root_dir):
  for file in filenames:
    try:
      print(file)
      loader=TextLoader(os.path.join(dirpath,file),encoding='utf-8')
      docs.extend(loader.load_and_split())
    except Exception as e:
      pass

text_splitter = CharacterTextSplitter(chunk_size=1000,chunk_overlap=0)
texts=text_splitter.split_documents(docs)

activations_backload.py
base_consumerviewau.py
schedules.py
forms_backload.py
base_consumerviewusa.py
generate_new_schedules.py
migrate.py
enrichment_functions.py
forgotten_customers_bulk_notebook.py
form_functions.py
dataset_functions.py
forgotten_customer_notebook.py
compare_dataflows.py
dataflow_functions.py
compare_build_index_runs.py
client_dataset_count.py
get_bk_schedules.py
check_schedules.py
get_ops_schedules.py
test_profiles.py
purge.py
process_activations.py
supply_transform.py
supply_enrichment.py
enrichment.py
alternating_least_squares.py
locality_sensitive_hashing.py
wide_view.py
enrichment.py
training.py
ltv_xgboost_training_12x.py
enrichment.py
enrichment_keyfix.py
deterministic_qa.py
deterministic.py
inference.py
benchmark_calendar_years.py
insights_delta_transform.py
benchmark_aggregations_export.py
benchmark_calendar_months.py
benchmark_calendar_weeks.py
general_validation.py
truncate_dataset.py
activations_backload.py
load_datasets_staging.py
forms_backload.py
attri



In [43]:
embeddings= HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
faiss_index = FAISS.from_documents(texts,embeddings)
faiss_index.save_local("/content/ngoctran/MyDrive/hackoween/index")

(…)f3d3c277d6e90027e55de9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

(…)7d6e90027e55de9125/1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

(…)e2f80f3d3c277d6e90027e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

(…)f80f3d3c277d6e90027e55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

(…)de9125/config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

(…)d3c277d6e90027e55de9125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

(…)90027e55de9125/sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

(…)6e90027e55de9125/special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

(…)f3d3c277d6e90027e55de9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

(…)7d6e90027e55de9125/tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

(…)3d3c277d6e90027e55de9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

(…)e2f80f3d3c277d6e90027e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)80f3d3c277d6e90027e55de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [44]:
print("loading indexes")
faiss_index=FAISS.load_local("/content/ngoctran/MyDrive/hackoween/index",embeddings)
retriever = faiss_index.as_retriever()
retriever.search_kwargs['distance_metric']='cos'
retriever.search_kwargs['fetch_k']=100
retriever.search_kwargs['maximal_marginal_relevance']=True
print("index loaded")

loading indexes
index loaded


In [45]:
model = AutoModelForCausalLM.from_pretrained(hf_model_repo, load_in_4bit=True, torch_dtype=torch.float16,
                                             device_map=device_map)

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

(…)uned/resolve/main/generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [59]:
def format_instruction(sample):
	return f"""### Instruction:
  Use the Task below and the Input given to write the Response, which is a programming code that can solve the following Task:

  ### Task:
  {sample['instruction']}

  ### Input:
  {sample['input']}

  """

question = {
      "instruction": "You are a Pyspark expert. Write a Pyspark script to read all SFTP files *_hackoween.json from the S3 bucket",
      "input": "*_hackoween.json"
}
format_instruction(question)

'### Instruction:\n  Use the Task below and the Input given to write the Response, which is a programming code that can solve the following Task:\n\n  ### Task:\n  You are a Pyspark expert. Write a Pyspark script to read all SFTP files *_hackoween.json from the S3 bucket\n\n  ### Input:\n  *_hackoween.json\n\n  '

In [46]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=2048,
    temperature=0,
    top_p=0.95,
    repetition_penalty=1.15,
    batch_size=1,
)
llm = HuggingFacePipeline(pipeline=pipe)


In [81]:
qa = ConversationalRetrievalChain.from_llm(llm,retriever=retriever)
chat_history = []
question = {
      "instruction": "You are a Pyspark expert. Load all SFTP JSON file that has a file pattern from the S3 bucket. SFTP files means files that are always in /imports and the file pattern is always /imports/{year}/{month}/{date}/ ",
      "input": "*_user_data.json"
}
while(True):
    print("Enter a question: ")
    result = qa({"question":format_instruction(question),"chat_history":chat_history})
    chat_history.append((format_instruction(question),result['answer']))
    print(result['answer'])


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Enter a question: 


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.




def load_sftp_files(config: HubSpotJobConfig) -> None:	
    df_list: List[DataFrame] = []
    if Config.load_style == "HISTORIC":
      df_list = [
          sc.read.format("csv").options(
              escape="\"", quote="\"").load(
                  f"s3://{Config.bucket}/imports/*/*/*/*_customer_data.csv",
                  header=True,
                  sep=",",
          ),
          sc.read.format("csv").options(
              escape="\"", quote="\"").load(
                  f"s3://{Config.bucket}/imports/*/*/*/*_order_data.csv",
                  header=True,
                  sep=",",
          ),
          sc.read.format("csv").options(
              escape="\"", quote="\"").load(
                  f"s3://{Config.bucket}/imports/*/*/*/*_transaction_data.csv",
                  header=True,
                  sep=",",
          ),
      ]
    else:      
      df_list = [
          sc.read.format("csv").options(
              escape="\"", quote="\"").load(
                  [


KeyboardInterrupt: ignored