In [1]:
# !pip install -q -U "transformers==4.36.2" "datasets==2.16.1" "accelerate==0.26.1" "bitsandbytes==0.42.0"

In [2]:
# !pip install -q -U git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e
# !pip install -q -U git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)

2024-04-23 01:43:40.780430: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-23 01:43:40.816587: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-23 01:43:40.816614: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-23 01:43:40.817512: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-23 01:43:40.823446: I tensorflow/core/platform/cpu_feature_guar

[2024-04-23 01:43:42,197] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"working on {device}")

working on cuda:0


In [7]:
# model_name = "meta-llama/Llama-2-7b-hf"
# model_name = "meta-llama/Llama-2-13b-chat-hf"
# model_name = "nousresearch/llama-2-7b-hf"
# model_name='DevilGod870/Llama-2-7b-chat-Hinglish'
# peft_model_id = "nateraw/llama-2-7b-english-to-hinglish"
model_name='sarvamai/OpenHathi-7B-Hi-v0.1-Base'
compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_quant_type="nf4", 
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    torch_dtype=compute_dtype,
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                          trust_remote_code=True,
                                         )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
from datasets import load_dataset
def format_dolly(sample):
    context = f"\n[question]  {sample['instruction']} [/question]"
    instruction = f"<s> [INST] <<SYS>> Answer the question based on the context below. <</SYS>> \n[context]: {sample['context']} [/context]" if len(sample["context"]) > 0 else None
    response = f"[/INST] [answer] {sample['response']} [/answer]"
    # join all the parts together
    prompt = "".join([i for i in [instruction, context, response] if i is not None])
    return prompt

# template dataset to add prompt to each sample
def template_dataset(sample):
    sample["text"] = f"{format_dolly(sample)}{tokenizer.eos_token}"
    return sample

# apply prompt template per sample
dataset = load_dataset("Vishwanath0912/qa_en_hi", split="train")

# Shuffle the dataset
dataset_shuffled = dataset.shuffle(seed=42)

# Select the first 50 rows from the shuffled dataset, comment if you want 15k
dataset = dataset_shuffled

train_qa = dataset.map(template_dataset, remove_columns=list(dataset.features))
train_qa

Dataset({
    features: ['text'],
    num_rows: 259
})

In [9]:
# from datasets import load_dataset
# def format_en(sample):
#     context = f""
#     instruction = f"<s> [INST] <<SYS>> Translate the following sentence from English to Hindi English Code Mixed Version (Hinglish). <</SYS>> \n[sentence]: {sample['en']} [/sentence]" if len(sample["en"]) > 0 else None
#     response = f" [/INST] [answer] {sample['hi_en']} [/answer]"
#     # join all the parts together
#     prompt = "".join([i for i in [instruction, context, response] if i is not None])
#     return prompt

# # template dataset to add prompt to each sample
# def template_en(sample):
#     sample["text"] = f"{format_en(sample)}{tokenizer.eos_token}"
#     return sample
    
# def format_en_hi(sample):
#     context = f""
#     instruction = f"<s> [INST] <<SYS>> Translate the following sentence from Hindi English Code Mixed Version (Hinglish) to English . <</SYS>> \n[sentence]: {sample['hi_en']} [/sentence]" if len(sample["hi_en"]) > 0 else None
#     response = f" [/INST] [answer] {sample['en']} [/answer]"
#     # join all the parts together
#     prompt = "".join([i for i in [instruction, context, response] if i is not None])
#     return prompt

# # template dataset to add prompt to each sample
# def template_en_hi(sample):
#     sample["text"] = f"{format_en_hi(sample)}{tokenizer.eos_token}"
#     return sample
# # apply prompt template per sample
# dataset = load_dataset("rvv-karma/English-Hinglish-TOP", split="train")
# dataset =  dataset.shuffle(seed=42)
# train_en = Dataset.from_pandas(pd.DataFrame(dataset[0:10000]))
# train_en = train_en.map(template_en,remove_columns=list(dataset.features))
# train_en_hi = Dataset.from_pandas(pd.DataFrame(dataset[15000:25000]))
# train_en_hi = train_en_hi.map(template_en_hi,remove_columns=list(dataset.features))
# train_en,train_en_hi

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

(Dataset({
     features: ['text'],
     num_rows: 10000
 }),
 Dataset({
     features: ['text'],
     num_rows: 10000
 }))

In [100]:
# apply prompt template per sample
dataset = load_dataset("Solshine/Hindi_English_QandA_Synth_Data_For_Hinglish_Project", split="train")


Downloading readme:   0%|          | 0.00/4.77k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.55M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Failed to read file '/root/.cache/huggingface/datasets/downloads/6bd4c00e7893146c7e7c50fee85d282fef84aeb3184153315d5747257426c9f9' with error <class 'pyarrow.lib.ArrowInvalid'>: Failed to convert JSON to double, couldn't parse:आपके कठिन परिश्रम ने मुझे प्रेरित किया है।


DatasetGenerationError: An error occurred while generating the dataset

In [None]:
dataset

In [None]:
from datasets import load_dataset
def format_dolly(sample):
    context = f"\n[question]  {sample['instruction']} [/question]"
    instruction = f"<s> [INST] <<SYS>> Answer the question based on the context below. <</SYS>> \n[context]: {sample['context']} [/context]" if len(sample["context"]) > 0 else None
    response = f"[/INST] [answer] {sample['response']} [/answer]"
    # join all the parts together
    prompt = "".join([i for i in [instruction, context, response] if i is not None])
    return prompt

# template dataset to add prompt to each sample
def template_dataset(sample):
    sample["text"] = f"{format_dolly(sample)}{tokenizer.eos_token}"
    return sample


# Shuffle the dataset
dataset_shuffled = dataset.shuffle(seed=42)

# Select the first 50 rows from the shuffled dataset, comment if you want 15k
dataset = dataset_shuffled

train_qa = dataset.map(template_dataset, remove_columns=list(dataset.features))
train_qa

In [10]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"working on {device}")

working on cuda:0


In [11]:
X_train = []
y_train = []
X_eval = []
y_eval = []
X_test = []
y_test = []

In [12]:
def generate_prompt(text,lab):
    return f"""
            Analyze the sentiment of the news headline enclosed in square brackets, 
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [{text}] = {lab}
            """.strip()

def generate_test_prompt(text):
    return f"""
            Analyze the sentiment of the news headline enclosed in square brackets, 
            determine if it is positive, neutral, or negative, and return the answer as 
            the corresponding sentiment label "positive" or "neutral" or "negative".

            [{text}] = """.strip()


In [13]:
with open('hineng/train.txt',encoding='UTF-8') as rf:
    lines = rf.readlines()

In [14]:
for lin in lines:
    da = lin.split('\t')
    lab = da[1].strip()
    X_train.append(da[0])
    y_train.append(lab)

In [15]:
len(X_train),X_train[0]

(14000,
 'nen vist bolest vztek smutek zmatek osam lost beznad j a nakonec jen klid Asi takhle vypad m j life')

In [16]:
with open('hineng/validation.txt',encoding='UTF-8') as rf:
    lines = rf.readlines()

In [17]:
for lin in lines:
    da = lin.split('\t')
    lab = da[1].strip()
    X_eval.append(da[0])
    y_eval.append(lab)

In [18]:
len(X_eval),X_eval[0]

(3000,
 'prahladspatel modi mantrimandal may samil honay par badhai narmaday har')

In [19]:
with open('hineng/test.txt',encoding='UTF-8') as rf:
    lines = rf.readlines()

In [20]:
X_test=[]
y_test = []
for lin in lines:
    da = lin.split('\t')
    lab = da[1].strip()
    X_test.append(da[0])
    y_test.append(lab)

In [21]:

train_data = []

In [22]:
for i in range(len(X_train)):
    train_data.append(generate_prompt(X_train[i],y_train[i]))

In [23]:
for j in train_qa['text']:
    train_data.append(j)

In [24]:
for j in train_en['text']:
    train_data.append(j)

In [25]:
for j in train_en_hi['text']:
    train_data.append(j)

In [26]:
train_df = pd.DataFrame(train_data, columns=['text'])

In [27]:
train_data = Dataset.from_pandas(train_df)

In [28]:
from sklearn.utils import shuffle

# Assuming train_data is your Dataset object
train_data = train_data.shuffle()


In [29]:
len(train_data)

34259

In [30]:
eval_data = []
for i in range(len(X_eval)):
    eval_data.append(generate_prompt(X_eval[i],y_eval[i]))
    

In [31]:
eval_df = pd.DataFrame(eval_data, columns=['text'])
eval_data = Dataset.from_pandas(eval_df)

In [32]:
eval_data

Dataset({
    features: ['text'],
    num_rows: 3000
})

In [33]:
test_data = []
for i in range(len(X_test)):
    test_data.append(generate_test_prompt(X_test[i]))
    

In [34]:
test_df = pd.DataFrame(test_data, columns=['text'])


In [35]:

len(test_df)

3000

In [36]:
def evaluate(y_true, y_pred):
    labels = ['positive', 'neutral', 'negative']
    mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
    def map_func(x):
        return mapping.get(x, 1)
    
    y_true = np.vectorize(map_func)(y_true)
    y_pred = np.vectorize(map_func)(y_pred)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')
    
    # Generate accuracy report
    unique_labels = set(y_true)  # Get unique labels
    
    for label in unique_labels:
        label_indices = [i for i in range(len(y_true)) 
                         if y_true[i] == label]
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')
        
    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)
    
    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2])
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [37]:
# pip install -U bitsandbytes

In [38]:
output_dir="trained_weigths"

peft_config = LoraConfig(
        lora_alpha=32, 
        lora_dropout=0.05,
        r=16,
        bias="none",
        target_modules =["q_proj", "v_proj", "k_proj", "down_proj", "gate_proj", "up_proj"],
        task_type="CAUSAL_LM",
)

training_arguments = TrainingArguments(
    output_dir=output_dir,                    # directory to save and repository id
    num_train_epochs=4,                       # number of training epochs
    per_device_train_batch_size=24,            # batch size per device during training
    gradient_accumulation_steps=1,            # number of steps before performing a backward/update pass
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,                         # log every 10 steps
    learning_rate=5e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=True,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="tensorboard",                  # report metrics to tensorboard
    # evaluation_strategy="epoch"               # save checkpoint every epoch
)

trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    # eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=1024,
    packing=False,
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    }
)

Map:   0%|          | 0/34259 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [39]:
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,3.4805
50,0.9393
75,1.7485
100,0.5244
125,1.586
150,0.5125
175,1.5707
200,0.503
225,1.5584
250,0.4899


TrainOutput(global_step=5712, training_loss=0.7423979064830545, metrics={'train_runtime': 8443.7774, 'train_samples_per_second': 16.229, 'train_steps_per_second': 0.676, 'total_flos': 5.838437179312374e+17, 'train_loss': 0.7423979064830545, 'epoch': 4.0})

In [40]:
trainer.save_model()

tokenizer.save_pretrained(output_dir)

('trained_weigths/tokenizer_config.json',
 'trained_weigths/special_tokens_map.json',
 'trained_weigths/tokenizer.model',
 'trained_weigths/added_tokens.json',
 'trained_weigths/tokenizer.json')

# without finetuning


In [41]:
#  y_pred = []
# for i in tqdm(range(len(test_data))):
#     prompt = test_data[i]
#     inputs = tokenizer(prompt, return_tensors="pt").to(device)
#     max_new_tokens = 1
#     temperature = 0.1
#     output = model.generate(input_ids=inputs.input_ids, 
#                                max_length=len(inputs.input_ids[0]) + max_new_tokens, 
#                                temperature=temperature, 
#                                pad_token_id=tokenizer.eos_token_id,
#                                eos_token_id=tokenizer.pad_token_id)
#     generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
#     answer = generated_text.split("=")[-1]
#     if "positive" in answer:
#        y_pred.append("positive")
#     elif "negative" in answer:
#        y_pred.append("negative")
#     else:
#        y_pred.append("neutral")

In [42]:
from peft import AutoPeftModelForCausalLM

finetuned_model = "./trained_weigths/"
compute_dtype = getattr(torch, "float16")
tokenizer = AutoTokenizer.from_pretrained("sarvamai/OpenHathi-7B-Hi-v0.1-Base")

model = AutoPeftModelForCausalLM.from_pretrained(
     finetuned_model,
     torch_dtype=compute_dtype,
     return_dict=False,
     low_cpu_mem_usage=True,
     device_map=device,
)

merged_model = model.merge_and_unload()
merged_model.save_pretrained("./merged_model",safe_serialization=True, max_shard_size="2GB")
tokenizer.save_pretrained("./merged_model")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

('./merged_model/tokenizer_config.json',
 './merged_model/special_tokens_map.json',
 './merged_model/tokenizer.model',
 './merged_model/added_tokens.json',
 './merged_model/tokenizer.json')

In [43]:
# y_pred = predict(test_df, merged_model, tokenizer)
# evaluate(y_true, y_pred)

In [44]:
# test_df

Unnamed: 0,text
0,Analyze the sentiment of the news headline enc...
1,Analyze the sentiment of the news headline enc...
2,Analyze the sentiment of the news headline enc...
3,Analyze the sentiment of the news headline enc...
4,Analyze the sentiment of the news headline enc...
...,...
2995,Analyze the sentiment of the news headline enc...
2996,Analyze the sentiment of the news headline enc...
2997,Analyze the sentiment of the news headline enc...
2998,Analyze the sentiment of the news headline enc...


In [45]:
 y_pred = []
# for i in tqdm(range(len(test_data))):
#     prompt = test_data[i]
#     pipe = pipeline(task="text-generation", 
#                         model=merged_model, 
#                         tokenizer=tokenizer, 
#                         max_new_tokens = 1, 
#                         temperature = 0.0,
#                        )
#     result = pipe(prompt)
#     answer = result[0]['generated_text'].split("=")[-1]
#     if "positive" in answer:
#         y_pred.append("positive")
#     elif "negative" in answer:
#          y_pred.append("negative")
#     else:
#         y_pred.append("neutral")

In [46]:
len(test_data)

3000

In [47]:
 y_pred = []
for i in tqdm(range(len(test_data))):
    prompt = test_data[i]
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    max_new_tokens = 1
    temperature = 0.0001
    output = merged_model.generate(input_ids=inputs.input_ids, 
                               max_length=len(inputs.input_ids[0]) + max_new_tokens, 
                               temperature=temperature, 
                               pad_token_id=tokenizer.eos_token_id,
                               eos_token_id=tokenizer.pad_token_id)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    answer = generated_text.split("=")[-1]
    if "positive" in answer:
       y_pred.append("positive")
    elif "negative" in answer:
       y_pred.append("negative")
    else:
       y_pred.append("neutral")

100%|███████████████████████████████████████| 3000/3000 [01:18<00:00, 38.33it/s]


In [48]:
from sklearn.metrics import f1_score

weighted_f1 = f1_score(y_test, y_pred, average='weighted')

In [49]:
weighted_f1

0.7368453476677815

In [50]:
# with open("sa_spa_eng", "w") as writer:
#         writer.write('\n'.join(y_pred))

In [51]:
len(y_pred)

3000

In [52]:
# y_pred

In [53]:
len(y_pred)

3000

In [54]:
# y_test

# QA TESTING

In [55]:
from datasets import load_dataset
def format_dolly(sample):
    context = f"\n[question]  {sample['instruction']} [/question]"
    instruction = f"<s> [INST] <<SYS>> Answer the question based on the context below. <</SYS>> \n[context]: {sample['context']} [/context]" if len(sample["context"]) > 0 else None
    response = f"[/INST]"
    # join all the parts together
    prompt = "".join([i for i in [instruction, context, response] if i is not None])
    return prompt

# template dataset to add prompt to each sample
def template_dataset(sample):
    sample["text"] = f"{format_dolly(sample)}{tokenizer.eos_token}"
    return sample

# apply prompt template per sample
dataset = load_dataset("Vishwanath0912/qa_en_hi", split="validation")

# Shuffle the dataset
# dataset_shuffled = dataset.shuffle(seed=42)

# Select the first 50 rows from the shuffled dataset, comment if you want 15k
# dataset = dataset_shuffled

dataset = dataset.map(template_dataset, remove_columns=list(dataset.features))
dataset

Dataset({
    features: ['text'],
    num_rows: 54
})

In [56]:
def extract_answer(strings):
    answers = []
    for s in strings:
        ts = s[s.find('[/INST]'):]
        start = ts.find('[answer]')+len('[answer]')
        end = ts.find('[/answer]')
        if start!=-1 and end!=-1:
            answers.append(ts[start:end])
        else:
            answers.append(ts)
    return answers

In [61]:
y_pred=[]
for i in tqdm(range(len(dataset))):
    prompt = dataset[i]['text']
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    max_new_tokens = 64
    temperature = 0.1
    output = merged_model.generate(input_ids=inputs.input_ids, 
                               max_length=len(inputs.input_ids[0]) + max_new_tokens, 
                               temperature=temperature, 
                               pad_token_id=tokenizer.eos_token_id,
                               eos_token_id=tokenizer.pad_token_id)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    y_pred.append(generated_text)

  0%|                                                    | 0/54 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  2%|▊                                           | 1/54 [00:01<01:34,  1.78s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  4%|█▋                                          | 2/54 [00:03<01:28,  1.70s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  6%|██▍                                         | 3/54 [00:05<01:25,  1.68s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the

In [62]:
fin_ans = extract_answer(y_pred)
file = f"meta18_llama_{max_new_tokens}_{temperature}.txt"
with open(file,'w') as f:
    for i in y_pred:
        f.write(i)
        f.write(f"\n")
import json

# Load the JSON file
with open('chat_gpt_3.5.json', 'r') as f:
    data = json.load(f)

# Get the keys
keys = list(data.keys())

print(keys)
data = dict(zip(keys, fin_ans))

# Write the dictionary to a JSON file
# o_file = f"hathi10_mixed_llama_{max_new_tokens}_{temperature}.json"
o_file = f"predictions.json"

with open(o_file, 'w') as f:
    json.dump(data, f)
print("File output at ",o_file)

['4', '9', '19', '49', '50', '54', '58', '68', '72', '80', '88', '89', '90', '107', '115', '133', '137', '140', '166', '200', '201', '202', '203', '204', '205', '206', '207', '208', '209', '210', '211', '212', '213', '214', '215', '216', '217', '218', '219', '220', '221', '222', '223', '224', '225', '226', '227', '228', '229', '230', '231', '232', '233', '234']
File output at  predictions.json


In [64]:
y_pred

[' [INST] <<SYS>> Answer the question based on the context below. <</SYS>> \n[context]: RTGS (Real Time Gross Settlement ) aur NEFT (National Electronic Funds Transfer) dono hi India mein Online paise bhejne ka jariya hain, Jiske dwara aap alag-alag Bank Accounts mein money transfer kar sakte hain. In dono payment System ko Reserve Bank of  India (RBI) manage karta hain. In dono payment system ke jariye sirf aap India ke andar hi paise send kar sakte hain. Aaiye jante hain dono mein kya difference hota hain. RTGS mein Funds turant hi transfer ho jate hain. Yeh different Bank ke beech mein fund transfer karne ka sabse fast medium hain. Rule yeh hain ki jis Bank ko paisa send kya gya hain usay receiver ke account mein 30 minutes ke andar paisa credit kar dena hota hain. Dusri aor NEFT ke tahat paise turant transfer nahi ho paate hain. Is System mein hours ke according time slot mein kaam hota hain. Isme aapko 2 se 3 ghante aur kabhi-kabhi usse bhi zyada time bhi lag jata hain.  Kya Bank 

In [63]:
fin_ans

[' National Electronic Funds Transfer ',
 ' 512 MB ',
 ' Android TV ',
 ' Microsoft ',
 ' PC ',
 ' 1. Bangladesh, Sri Lanka, Pakistan aur Nepal ',
 ' Android ',
 ' 8 GB ',
 '[/INST]yeah sure you are talking about Micromax Yu Yunique smartphone full specifications and features. Isme aapko pata chalega ki Micromax Company ne is smartphone mein kya processor diya hain. Isme aapko 1.2 GHz ka Quad',
 ' Android ',
 ' Amazon ',
 ' Website ',
 ' Amazon ',
 ' 100 Crore ',
 ' 2011 ',
 ' Mukesh Ambani ',
 ' Microsoft Edge ',
 '',
 ' 8 Megapixel ',
 ' France ',
 ' carrots ',
 ' 1945 ',
 ' Japan ',
 '',
 '',
 ' Japan ',
 ' Japan ',
 ' Ryuk ',
 ' Rabbit ',
 ' Aravind Kejriwal ',
 ' Hogwarts School of Witchcraft and Wizardry ',
 '',
 ' Shah Jahan ',
 ' Tsugumi Ohba ',
 ' dinosaurs ',
 ' dinosaurs ',
 ' dinosaur ',
 '',
 ' Argentina ',
 ' Michael Crichton ',
 ' dinosaurs ',
 ' dinosaurs ',
 ' brachiosaurus ',
 ' Tim Cook ',
 ' 1912 ',
 ' Emaar Properties ',
 ' Ontario ',
 ' 1999 ',
 ' 1997 ',
 ' 1903 

In [77]:
def format_en(sample):
    context = f""
    instruction = f"<s> [INST] <<SYS>> Translate the following sentence from English to Hindi English Code Mixed Version (Hinglish). <</SYS>> \n[sentence]: {sample['en']} [/sentence]" if len(sample["en"]) > 0 else None
    response = f" [/INST] "
    # join all the parts together
    prompt = "".join([i for i in [instruction, context, response] if i is not None])
    return prompt

In [83]:
def format_en_hi(sample):
    context = f""
    instruction = f"<s> [INST] <<SYS>> Translate the following sentence from Hindi English Code Mixed Version (Hinglish) to English . <</SYS>> \n[sentence]: {sample['hi_en']} [/sentence]" if len(sample["hi_en"]) > 0 else None
    response = f" [/INST] "
    # join all the parts together
    prompt = "".join([i for i in [instruction, context, response] if i is not None])
    return prompt

In [94]:
sample ={}
sample['instruction'] = ' Microsoft jy. '
sample['context'] = '''Vishwanath, jise log pyaar se Kakvasha bhi kehte hain, ek aane wale finance analyst hain. Unka naam hi kaafi hai, kyunki woh ek trail blazer hain, jo apne unique soch aur tajurbe se finance ki duniya mein nayi kranti la rahe hain. 

Inka dil pehle Acheron ke liye dhadakta tha, lekin jab Acheron ne unhe chhod diya, toh unke dil par gehre nishaan pad gaye. Lekin jaise ki kaha jata hai, har raat ke baad ek nayi subah hoti hai, waise hi Vishwanath ki zindagi mein bhi ek nayi roshni aayi, jiska naam hai Firefly. 

Firefly, Vishwanath ki zindagi ka wo diya hai jo unke andhere ko roshan karti hai. Unka saath dekar, Firefly ne Vishwanath ko ek nayi ummeed di hai. Ab Vishwanath apne aap ko phir se sambhal rahe hain, aur apne career ko nayi udaan de rahe hain. Unki kahani ek prerna srot hai, jo sabit karti hai ki kathinaiyon ke baad bhi zindagi mein khushiyan la sakte hain.
'''
prompt = format_dolly(sample)

In [95]:
inputs = tokenizer(prompt, return_tensors="pt").to(device)
max_new_tokens = 128
temperature = 0.1
output = merged_model.generate(input_ids=inputs.input_ids, 
                           max_length=len(inputs.input_ids[0]) + max_new_tokens, 
                           temperature=temperature, 
                           pad_token_id=tokenizer.eos_token_id,
                           eos_token_id=tokenizer.pad_token_id)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

In [96]:
s=generated_text
ts = s[s.find('[/INST]'):]
ts

'[/INST] [answer] Vishwanath. [/answer] [/INST] [/answer] [/question] [/INST] [answer] Vishwanath. [/answer] [/INST] [/answer] [/INST] [/question] [/INST] [/answer] Vishwanath. [/answer] [/INST] [/question] [/INST] [/answer] Vishwanath. [/answer] [/INST] [/question] [/INST] [/answer] Vishwanath. [/answer] [/INST] ['