# Все протестированные идеи


### 1.  Крупная open-source LLM (Phi-3.5 mini)
- С использованием библиотеки unsloth инференс быстрее в 2 раза; модель была дообучена на датасете.
- Модель достаточно мала для LLM (3.8B). Плохо знает русский.
- Мной была обучена на [датасете QA](https://huggingface.co/datasets/kuznetsoffandrey/sberquad) (600K примеров, контекст соединялся с вопросом).
- Работает хорошо, но на CPU очень медленно. На Tesla T4 время ответа — 4 с.

### 2.   Более мелкая модель (BertQA), дообученная на крупном датасете
- Взят DistilBERT (200 МБ)
- Обучен на поиск токенов начала и конца ответа



# Phi-3.5 mini inference

In [None]:
%%capture
!pip install unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset
from unsloth.chat_templates import get_chat_template

In [None]:
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Load the instruct checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3.5-mini-instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

# Every conversation in this notebook uses ShareGPT-style keys
# ("from"/"value" with "human"/"gpt" speakers). The stock chat template reads
# "role"/"content", so without this mapping apply_chat_template would not
# render the messages correctly.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "phi-3",
    mapping = {"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
)

In [None]:
# ShareGPT-style conversations (train split), used below to try out prompt formatting.
example_data = load_dataset("philschmid/guanaco-sharegpt-style", split = "train")
# Train split of the SberQuAD question-answering dataset.
sberquad_data = load_dataset("kuznetsoffandrey/sberquad", split = "train")

In [None]:
# Display a single SberQuAD training record to inspect its fields.
sberquad_data[145]

In [None]:
# Display one conversation from the "conversations" column to inspect its structure.
example_data['conversations'][5]

In [None]:
def formatting_prompts_func(examples):
    """Render every conversation of a batch into one prompt string.

    Args:
        examples: A batch (as passed by ``Dataset.map(..., batched=True)``)
            with a "conversations" column.

    Returns:
        A dict with a single "text" key holding one rendered string per
        conversation, in the same order as the input.
    """
    rendered = []
    for conversation in examples["conversations"]:
        # tokenize=False keeps the result as text; no generation prompt is
        # appended to the rendered conversation.
        prompt = tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,
        )
        rendered.append(prompt)
    return {"text": rendered}

In [None]:
# Add a "text" column with the chat-template rendering of each conversation.
example_data = example_data.map(formatting_prompts_func, batched = True,)

In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Single-turn prompt: the context sentence and the question are concatenated
# into one human message written with ShareGPT-style keys ("from"/"value").
# NOTE(review): the tokenizer's chat template must understand these keys — confirm.
messages = [
    {"from": "human", "value": "Они представлены известковыми выделениями сине-зелёных водорослей. Чем представлены органические остатки?"},
]
# Render and tokenize the prompt as PyTorch tensors, then move them to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

In [None]:
# Generate at most 128 new tokens for the prepared prompt.
outputs = model.generate(input_ids = inputs, max_new_tokens = 128, use_cache = True)
# Decode the generated token ids back to text and display the result.
tokenizer.batch_decode(outputs)

# BertQA

In [None]:
!pip install datasets pymystem3

In [None]:
!mkdir Roberta_base

In [None]:
from datasets import load_dataset
from transformers import DefaultDataCollator, AutoTokenizer, pipeline, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from tqdm.auto import tqdm
import torch
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# First 5000 training examples of the English SQuAD dataset.
example_data = load_dataset("squad", split="train[:5000]")
# NOTE: despite its name, `squad` holds SberQuAD with all of its splits;
# the cells below index it by split name ("train", "validation", "test").
squad = load_dataset("kuznetsoffandrey/sberquad")

In [None]:
# Tokenizer and extractive-QA model from the "deepset/tinyroberta-squad2" checkpoint.
# NOTE(review): the notes and output paths elsewhere mention DistilBERT,
# but the checkpoint loaded here is tinyroberta — confirm which one is intended.
tokenizer = AutoTokenizer.from_pretrained("deepset/tinyroberta-squad2")
model = AutoModelForQuestionAnswering.from_pretrained("deepset/tinyroberta-squad2")

In [None]:
# Default collator; preprocess_function already pads every example to max_length.
data_collator = DefaultDataCollator()

In [None]:
def preprocess_function(examples):
    """Tokenize question/context pairs and attach answer-span token labels.

    Args:
        examples: A batch with "question", "context" and "answers" columns.
            Each "answers" entry holds parallel "text" and "answer_start"
            lists; only the first answer of each example is used.

    Returns:
        The tokenizer output (without "offset_mapping") extended with
        "start_positions" and "end_positions": one token index per example.
        Examples whose answer is missing, or is not fully contained in the
        (possibly truncated) context, are labelled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=512,
        truncation=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Offsets map each token to its character span; they are only needed to
    # compute the labels and are removed from the returned features.
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        sequence_ids = inputs.sequence_ids(i)
        n_tokens = len(sequence_ids)

        # Find the first and last token that belong to the context
        # (sequence id 1); the scans are bounded so they cannot run past
        # the end of the sequence.
        idx = 0
        while idx < n_tokens and sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < n_tokens and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # No answer annotated, or no context tokens at all: label (0, 0).
        has_answer = bool(answer["answer_start"]) and bool(answer["text"])
        if not has_answer or context_start > context_end:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # If the answer is not fully inside the context, label it (0, 0).
        # Comparing start with start and end with end also rejects answers
        # that were only partially cut off by truncation; otherwise the end
        # label would land on the token right after the context.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Last context token whose span starts at or before the answer start.
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # First context token (scanning backwards) whose span ends at or
        # after the answer end.
        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Tokenize and label every split; the original columns are dropped so that
# only the features returned by preprocess_function remain.
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

In [None]:
# Shared lemmatizer instance used by normalize_text.
mystem = Mystem()
# Russian stop-word list from NLTK.
# NOTE(review): assumes the NLTK "stopwords" corpus is already downloaded —
# no nltk.download call is visible in this notebook; confirm.
russian_stopwords = stopwords.words("russian")


def normalize_text(text):
    """Lemmatize ``text`` and drop stop words, whitespace and punctuation.

    Args:
        text: Raw answer string.

    Returns:
        A list of the lemmas produced by ``mystem.lemmatize`` for the
        lower-cased text, without Russian stop words, whitespace-only
        chunks, or chunks made up solely of punctuation characters.
    """
    lemmas = mystem.lemmatize(text.lower())
    # The previous version built this filtered list and then returned the
    # untouched input string, so the normalisation never took effect.
    # The all(...) test also covers whitespace-only chunks (empty after
    # strip) and multi-character punctuation runs such as "...".
    return [
        lemma
        for lemma in lemmas
        if lemma not in russian_stopwords
        and not all(char in punctuation for char in lemma.strip())
    ]


def compute_f1(prediction, truth):
    """Compute the token-overlap F1 between a predicted and a reference answer.

    Both strings are passed through ``normalize_text`` first.

    Args:
        prediction: Answer text produced by the model.
        truth: Reference answer text.

    Returns:
        F1 in the range [0, 1]. If either normalised side is empty, the
        result is 1 when both sides are equal and 0 otherwise.
    """
    from collections import Counter

    pred_tokens = normalize_text(prediction)
    truth_tokens = normalize_text(truth)

    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    # Multiset intersection: a repeated token counts as many times as it
    # appears on both sides. A plain set intersection divided by the full
    # sequence lengths under-scores answers with repeated tokens (even two
    # identical answers could score below 1).
    overlap = Counter(pred_tokens) & Counter(truth_tokens)
    num_common = sum(overlap.values())

    if num_common == 0:
        return 0

    prec = num_common / len(pred_tokens)
    rec = num_common / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)


def compute_batch_f1(data, pipeline):
    """Return the mean F1 of a question-answering callable over a dataset.

    Args:
        data: Sized iterable of records with "question", "context" and
            "answers" fields; the first entry of ``answers["text"]`` is
            used as the reference.
        pipeline: Callable invoked as ``pipeline(question, context)`` that
            returns a mapping with an "answer" key.

    Returns:
        The average of ``compute_f1`` over all records, or 0.0 when
        ``data`` is empty.
    """
    # Avoid a ZeroDivisionError on an empty evaluation set.
    if len(data) == 0:
        return 0.0

    total_f1 = 0
    for cur in tqdm(data):
        question, context, ans = cur['question'], cur['context'], cur['answers']['text'][0]
        total_f1 += compute_f1(pipeline(question, context)['answer'], ans)

    return total_f1 / len(data)

In [None]:
# Wrap the current model and tokenizer in a question-answering pipeline on the selected device.
question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer, device=device)

In [None]:
# Score the pipeline on the first 3000 validation examples of SberQuAD.
part_val = load_dataset("kuznetsoffandrey/sberquad", split="validation[:3000]")
# NOTE: the mean F1 is stored but not displayed by this cell.
mean_f1_score = compute_batch_f1(part_val, question_answerer)

In [None]:
# Fine-tuning configuration: evaluation and checkpointing both happen every
# 750 steps, for 5 epochs with a batch size of 32 per device.
# NOTE(review): the output directory is named "DistilBert_check" although the
# checkpoint loaded above is tinyroberta — confirm the intended name. A later
# shell cell zips a checkpoint from this directory, so rename both together.
training_args = TrainingArguments(
    output_dir="DistilBert_check",
    eval_strategy="steps",
    eval_steps=750,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy='steps',
    save_steps=750
)

# Train on the tokenized "train" split and evaluate on "validation".
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Run the fine-tuning loop.
trainer.train()

In [None]:
!zip -r check_distillBertQA_5.zip /kaggle/working/DistilBert_check/checkpoint-7085

In [None]:
!rm /kaggle/working/check_distillBertQA_5.zip

In [None]:
# Spot-check the pipeline on a single example from the test split.
question_answerer(squad['test'][243]['question'], squad['test'][243]['context'])