In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 pandas datasets openai tqdm

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import re
from collections import defaultdict

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import pandas as pd
import datasets
from openai import OpenAI
from tqdm import tqdm

In [None]:
dataset_path = "/content/drive/MyDrive/nlp/ukr_lit/texts/1"
save_path = "/content/drive/MyDrive/nlp/shevchenko_prompts.csv"

client = OpenAI(api_key=os.environ['api_key'])

def label_data(data):

    prompt = f"""
    Якою була б вказівка написати подібний вірш, якби його було можливо написати за замовленням?
    Text: "
       {data}
       "
    Response: Відповідь повинна включати тільки коротку інструкцію, від імені замовника
    """



    completion = client.chat.completions.create(
      model="gpt-3.5-turbo",
      messages=[
          {"role": "system", "content": ""},
          {"role": "user", "content": prompt}
      ]
    )

    response = completion.choices[0].message.content
    return response

if not os.path.exists(save_path):
  files = defaultdict(list)
  for file in tqdm(os.listdir(dataset_path)):
    if file.endswith(".txt"):
        with open(os.path.join(dataset_path, file), 'r') as f:
            text = f.read()
        text = re.sub(r'\n+', '\n', text)
        title = text.split('\n')[1]
        verse = '\n'.join(text.split('\n')[4:])
        files['text'].append(verse)
        files['label'].append(title)
        files['file'].append(file)
        try:
          prompt = label_data(verse)
          files['prompt'].append(prompt)
        except Exception as e:
          print(f"{file}: {e}")
          files['prompt'].append("")
  df = pd.DataFrame.from_dict(files)
  df.to_csv(save_path, index=False)
else:
  print('Loaded from cache')
  df = pd.read_csv(save_path)

Loaded from cache


In [None]:
def generate_train_data(row):
  prompt = row['prompt']
  instruction = "You are a Ukrainian poet Taras Shevchenko. Create literature"
  verse = row['text']

  full_prompt = f"""<s>### Instruction:
  {instruction}

  ### Input:
  {prompt}

  ### Response:
  {verse}
  </s>
  """

  row['train_data'] = full_prompt
  return row

dataset = datasets.Dataset.from_pandas(df)

dataset = dataset.filter(lambda x: x["prompt"] is not None)
dataset = dataset.map(generate_train_data)
dataset.to_pandas().head(5)

Filter:   0%|          | 0/264 [00:00<?, ? examples/s]

Map:   0%|          | 0/263 [00:00<?, ? examples/s]

Unnamed: 0,text,label,file,prompt,train_data
0,"Блаженний муж на лукаву\nНе вступає раду,\nІ ...",Блажений муж на лукаву...,15376.txt,"Напишіть вірш, який пропагує добрі діла та пок...",<s>### Instruction:\n You are a Ukrainian poe...
1,"Бували войни й військовії свари:\nГалаґани, і...",Бували войни й військовії свари,23421.txt,"Напишіть вірш, що використовує жанр сатири та ...",<s>### Instruction:\n You are a Ukrainian poe...
2,"Злоначинающих спини,\nУ пута кутії не куй,\nВ...",Злоначинающих спини...,13794.txt,"Напишіть вірш, що закликає до добра, підтримки...",<s>### Instruction:\n You are a Ukrainian poe...
3,"Фрагмент з поеми ""Княжна""\nЗоре моя вечірняя,\...",Зоре моя вечірняя,20032.txt,"Напишіть вірш, що відтворює красу природи та н...",<s>### Instruction:\n You are a Ukrainian poe...
4,Н. Я. МАКАРОВУ\nНа пам'ять 14 сентября\nБарвін...,Барвінок цвів і зеленів,23419.txt,Напишіть вірш на основі подібної структури та ...,<s>### Instruction:\n You are a Ukrainian poe...


In [None]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

# Load base model
new_model = "llama-2-7b-chat-shevchenko"
model_name = "NousResearch/Llama-2-7b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0}
)
model.config.use_cache = True
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=64,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir="/content/drive/MyDrive/nlp",
    num_train_epochs=10,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,
    learning_rate=3e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="none"
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="train_data",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Map:   0%|          | 0/263 [00:00<?, ? examples/s]

In [None]:
trainer.train()

Step,Training Loss
25,2.2193
50,2.1748
75,2.1767
100,1.8862
125,1.917
150,1.7447
175,1.6708
200,1.6579
225,1.4965
250,1.416


TrainOutput(global_step=660, training_loss=1.157061597072717, metrics={'train_runtime': 5712.0328, 'train_samples_per_second': 0.46, 'train_steps_per_second': 0.116, 'total_flos': 3.36221034307584e+16, 'train_loss': 1.157061597072717, 'epoch': 10.0})

In [None]:
trainer.model.save_pretrained(os.path.join('/content/drive/MyDrive/nlp', new_model))
trainer.tokenizer.save_pretrained(os.path.join('/content/drive/MyDrive/nlp', new_model))

('/content/drive/MyDrive/nlp/llama-2-7b-chat-shevchenko/tokenizer_config.json',
 '/content/drive/MyDrive/nlp/llama-2-7b-chat-shevchenko/special_tokens_map.json',
 '/content/drive/MyDrive/nlp/llama-2-7b-chat-shevchenko/tokenizer.model',
 '/content/drive/MyDrive/nlp/llama-2-7b-chat-shevchenko/added_tokens.json',
 '/content/drive/MyDrive/nlp/llama-2-7b-chat-shevchenko/tokenizer.json')

In [None]:
# new_model = "llama-2-7b-chat-shevchenko"
# model_name = "NousResearch/Llama-2-7b-chat-hf"

# base_model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     low_cpu_mem_usage=True,
#     torch_dtype=torch.float16,
#     device_map={"": 0},
# )
# model = PeftModel.from_pretrained(base_model, os.path.join('/content/drive/MyDrive/nlp', new_model))
# model = model.merge_and_unload()

# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"

In [None]:
prompt = """Напишіть вірш про красу природи. Акцентуйте увагу на швидкоплинності часу. Довжина вірша не більше 12 рядків."""
instruction = "You are a Ukrainian poet Taras Shevchenko. Create literature."

full_prompt = f"""<s>### Instruction:
{instruction}

### Input:
{prompt}

"""

pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=600, pad_token_id=tokenizer.eos_token_id)
result = pipe(full_prompt)
print(result[0]['generated_text'])

<s>### Instruction:
You are a Ukrainian poet Taras Shevchenko. Create literature.

### Input:
Напишіть вірш про красу природи. Акцентуйте увагу на швидкоплинності часу. Довжина вірша не більше 12 рядків.

### Response:
Зима пропадає,
Серце немає,
А зоря, як тая куня,
Мір заснула.
Почуєш — гріє сонце,
Місто виходить
На улицю, на поле —
На пригоду.
Не журиться, не журитьсь,
Найдеться зоря,
Місяця небащесія,
Зима проспіє.
Кругом неба й землі
Верби зелені,
А на полі утоптані
Верба й гойдаються.
На тім полі на гаю
Вишневий гай,
І вишні — чорнії, малі,
Десь на гаї
Росли, розпустились,
Землі небаї.
Кругом їх степи, поле,
І над ними
Сонечко гріє, любить,
І кайдани…
Нема й того. Гойдається,
Небаї… нема.
[Кос-Арал 1849]
 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 



In [None]:
prompt = """Напишіть вірш, що використовує жанр сатири для викриття соціальних проблем."""

full_prompt = f"""</s><s>### Instruction:
{instruction}

### Input:
{prompt}

"""

result = pipe(full_prompt)
print(result[0]['generated_text'])

</s><s>### Instruction:
You are a Ukrainian poet Taras Shevchenko. Create literature.

### Input:
Напишіть вірш, що використовує жанр сатири для викриття соціальних проблем.

### Response:
Неначе в п'яний стан
Звільніть у мене раду,
Нехай, бачте, вигляне
Моя вдовиця. Зять, брате!
Хоч не в п'яний, але не в раді
Моє серце б'ється. Нехай
Вигляне, хвалиться, лає
Мою вдовицю. Зять, брате!
Моя вдовиця, моя слава.
Нехай лає. Нехай бог
Вам, моя слава, дає…
Та дає й мені. Нехай:
Моя слава — і доведе
Самому мені, що тая
Або щея. Боже милий!
Моя вдовиця, моя слава!
Нехай лає. Нехай бог
Вам, моя слава, дає…
Та дає й мені. Нехай:
Моя слава — і доведе
Самому мені, що тая
Або щея. Боже милий!
Моя вдовиця, моя слава!
<e>
 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 
