In [None]:
!pip install torch peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 accelerate einops tqdm scipy

In [None]:
import os
from dataclasses import dataclass, field
from typing import Optional
from huggingface_hub import interpreter_login
import torch
from datasets import load_dataset
from datasets import load_from_disk
from peft import LoraConfig, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
)
from tqdm.notebook import tqdm

from trl import SFTTrainer

In [None]:
# Pull the train split of the notification-snippet dataset from the Hugging Face Hub.
dataset = load_dataset(path="notmehul/slicknotifications", split="train")

Downloading data:   0%|          | 0.00/80.2k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Display the dataset summary (column names and row count).
dataset

Dataset({
    features: ['input', 'output'],
    num_rows: 538
})

In [None]:
import pandas as pd

# Materialise the Hugging Face dataset as a pandas DataFrame for row-wise processing.
df = pd.DataFrame(data=dataset)

# Preview the first five rows.
df.head()

Unnamed: 0,input,output
0,character development fr,character development fr
1,Dard me koi masum pyara nhi hota... \n Dil ho ...,Dard me koi masum pyara nhi hota
2,Felt like becoming one among stars myself but ...,couldn't watch them die slowly
3,best season to be in this city?,best season to be in this city?
4,what are some safe spaces on the campus,safe spaces on the campus


In [None]:
# format according to model input
def format_row(row):
    """Build the instruction-style training string for one dataset row.

    Args:
        row: Mapping-like record (for example a DataFrame row) that provides
            'input' and 'output' entries.

    Returns:
        The input wrapped in ``[INST] ... [/INST]`` with the task instruction,
        followed by the target substring and a single trailing space.
    """
    source_text, target_text = row['input'], row['output']
    return "[INST] find the most catchy substring in: {} [/INST] {} ".format(source_text, target_text)

# Format every row (axis="columns" hands each row to format_row) and keep the
# result in a new 'Formatted' column.
formatted_rows = df.apply(format_row, axis="columns")
df['Formatted'] = formatted_rows
df['Formatted']

0      [INST] find the most catchy substring in: char...
1      [INST] find the most catchy substring in: Dard...
2      [INST] find the most catchy substring in: Felt...
3      [INST] find the most catchy substring in: best...
4      [INST] find the most catchy substring in: what...
                             ...                        
533    [INST] find the most catchy substring in: orga...
534    [INST] find the most catchy substring in: Abhi...
535    [INST] find the most catchy substring in: a fi...
536    [INST] find the most catchy substring in: bhoo...
537    [INST] find the most catchy substring in: most...
Name: Formatted, Length: 538, dtype: object

In [None]:
# Produce a renamed copy of the frame in which the formatted column is called 'Text'.
column_renames = {'Formatted': 'Text'}
new_df = df.rename(columns=column_renames)

new_df

Unnamed: 0,input,output,Text
0,character development fr,character development fr,[INST] find the most catchy substring in: char...
1,Dard me koi masum pyara nhi hota... \n Dil ho ...,Dard me koi masum pyara nhi hota,[INST] find the most catchy substring in: Dard...
2,Felt like becoming one among stars myself but ...,couldn't watch them die slowly,[INST] find the most catchy substring in: Felt...
3,best season to be in this city?,best season to be in this city?,[INST] find the most catchy substring in: best...
4,what are some safe spaces on the campus,safe spaces on the campus,[INST] find the most catchy substring in: what...
...,...,...,...
533,organized a successful study group for math,study group for math,[INST] find the most catchy substring in: orga...
534,Abhivyakti'24 ke pass mil gayeee??? kya aap sb...,Abhivyakti'24 ke pass mil gayeee,[INST] find the most catchy substring in: Abhi...
535,a film that accurately represents life at Bhar...,a film that accurately represents life,[INST] find the most catchy substring in: a fi...
536,bhookh lagi hai~ canteen ka Dosa out of stock ...,canteen ka Dosa out of stock hogya hai,[INST] find the most catchy substring in: bhoo...


In [None]:
# Keep only the 'Text' column (still as a one-column DataFrame) and preview three rows.
new_df = new_df.loc[:, ['Text']]

new_df.head(n=3)

Unnamed: 0,Text
0,[INST] find the most catchy substring in: char...
1,[INST] find the most catchy substring in: Dard...
2,[INST] find the most catchy substring in: Felt...


In [None]:
# Round-trip the formatted text through a CSV file so it can be loaded back as
# a Hugging Face dataset with a single 'Text' column.
csv_path = "formatted_data.csv"
new_df.to_csv(csv_path, index=False)
final_df = pd.read_csv(csv_path)

training_dataset = load_dataset("csv", data_files=csv_path, split="train")

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Display the dataset summary to confirm the CSV load produced the expected column and row count.
training_dataset

Dataset({
    features: ['Text'],
    num_rows: 538
})

In [None]:
# Base checkpoint to fine-tune and the name chosen for the resulting model.
base_model = "microsoft/phi-2"
new_model = "notiphier"

# Tokenizer: reuse the EOS token for padding and pad on the right.
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# 4-bit NF4 quantisation with float16 compute and no double quantisation.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Load the quantised base model onto GPU 0.
# NOTE(review): flash_attn / flash_rotary / fused_dense are presumably consumed
# by the checkpoint's remote modelling code at the pinned revision (hence
# trust_remote_code=True) -- confirm they are still accepted if the revision changes.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    trust_remote_code=True,
    flash_attn=True,
    flash_rotary=True,
    fused_dense=True,
    low_cpu_mem_usage=True,
    device_map={"": 0},
    revision="refs/pr/23",
)

# The KV cache is not useful during training and conflicts with gradient checkpointing.
model.config.use_cache = False
# NOTE(review): pretraining_tp is carried over as-is; verify this config field
# is actually read by the phi-2 modelling code.
model.config.pretraining_tp = 1

# Prepare the quantised model for k-bit training with gradient checkpointing enabled.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

training_arguments = TrainingArguments(
    output_dir="./notifier",
    num_train_epochs=10,
    per_device_train_batch_size=2,
    # 2 sequences x 32 accumulation steps = 64 sequences per optimiser step.
    gradient_accumulation_steps=32,
    # No eval_dataset is handed to the trainer below, so step-based evaluation
    # would raise as soon as it was triggered; evaluation is disabled explicitly.
    evaluation_strategy="no",
    logging_steps=15,
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_steps=2000,
    warmup_ratio=0.05,
    weight_decay=0.01,
    max_steps=-1,  # -1: let num_train_epochs determine the number of steps
)

# LoRA adapter configuration.
peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["Wqkv", "fc1", "fc2"],  # specific to the model
)

# Supervised fine-tuning on the 'Text' column of the formatted dataset.
trainer = SFTTrainer(
    model=model,
    train_dataset=training_dataset,
    peft_config=peft_config,
    dataset_text_field="Text",
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_arguments,
)

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/755 [00:00<?, ?B/s]

configuration_phi.py:   0%|          | 0.00/2.03k [00:00<?, ?B/s]

modeling_phi.py:   0%|          | 0.00/33.7k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/577M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]



Map:   0%|          | 0/538 [00:00<?, ? examples/s]

In [None]:
# Run the fine-tuning loop; the returned TrainOutput is shown as the cell result.
trainer.train()

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


TrainOutput(global_step=80, training_loss=1.9998915851116181, metrics={'train_runtime': 863.8131, 'train_samples_per_second': 6.228, 'train_steps_per_second': 0.093, 'total_flos': 2446441145241600.0, 'train_loss': 1.9998915851116181, 'epoch': 9.52})

In [None]:
from transformers import pipeline

In [None]:
# Sanity-check the fine-tuned model on a single hand-written post.
prompt = "few days back i saw a girl in a fest uff her eyes!! she was wearing black tee and a cargo and she has a small diamond tattoo on her left hand since then i hadn’t seen her but yesterday she was near the nescafe can anyone tag her slick id"

# Training leaves the model in training mode; switch to eval so the LoRA
# dropout is not applied while generating.
model.eval()

pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=100)

# Use exactly the template the training rows were formatted with, including
# the space after "in:", so inference matches what the model was trained on.
result = pipe(f"[INST] find the most catchy substring in: {prompt} [/INST]")
print(result[0]['generated_text'])

[INST] find the most catchy substring in:few days back i saw a girl in a fest uff her eyes!! she was wearing black tee and a cargo and she has a small diamond tattoo on her left hand since then i hadn’t seen her but yesterday she was near the nescafe can anyone tag her slick id [/INST] few days back i saw a girl in a fest uff her eyes 

i was wearing black tee and a cargo 




In [None]:
trainer.save_model("./notifier")
! cd /content/./notifier
! git config --global init.defaultBranch https://huggingface.co/notmehul/notiphier
! git init && git remote add origin && git pull origin main

Reinitialized existing Git repository in /content/.git/
usage: git remote add [<options>] <name> <url>

    -f, --fetch           fetch the remote branches
    --tags                import all tags and associated objects when fetching
                          or do not fetch any tag at all (--no-tags)
    -t, --track <branch>  branch(es) to track
    -m, --master <branch>
                          master branch
    --mirror[=(push|fetch)]
                          set up remote as a mirror to push to or fetch from

