<a href="https://colab.research.google.com/github/vitaliy-sharandin/data_science_projects/blob/master/portfolio/nlp/fine-tuned-llm/wisai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WisAI
### WisAI is a GPT-NeoX-20B model fine-tuned on philosophical and psychological data and configured to provide useful advice.

In [30]:
!pip install -U -q gradio
!pip install -U -q transformers
!pip install -U -q datasets
!pip install -U -q accelerate
!pip install -U -q bitsandbytes
!pip install -U -q peft
!pip install -U -q trl

!pip install -U -q evaluate
!pip install -U -q rouge_score

In [31]:
import json
import os

import gradio as gr
import numpy as np
import torch
import yaml
from datasets import Dataset
from evaluate import load
from google.colab import drive
from peft import LoraConfig
from transformers import GenerationConfig, Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer

In [32]:
# Small model
# Only the tokenizer of the small checkpoint is created here; loading the model itself is commented out below.
small_model_name = "EleutherAI/gpt-neo-125M"

small_tokenizer = AutoTokenizer.from_pretrained(small_model_name)
# Register '[PAD]' as the tokenizer's padding token.
# NOTE(review): if small_model is ever loaded, its embedding matrix presumably has to be resized
# to cover the added token — confirm before enabling the line below.
small_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# small_model = AutoModelForCausalLM.from_pretrained(small_model_name)



# Base model
# Checkpoint that the rest of the notebook fine-tunes and queries.
model_name = "EleutherAI/gpt-neox-20b"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Same padding-token registration as for the small tokenizer; no embedding resize is performed in this cell.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# bitsandbytes settings: 4-bit loading, NF4 quantization type, double quantization enabled,
# and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the base checkpoint with the quantization settings above.
# device_map={"": 0} presumably places the whole model on device index 0 — confirm for multi-GPU runtimes.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"":0}
)

# model = AutoModelForCausalLM.from_pretrained(main_model_name)
# # model.resize_token_embeddings(len(tokenizer))

# Seed torch's global RNG; note this happens after the model has been loaded.
torch.manual_seed(42)
# Print the module structure of the loaded model.
print(base_model)

# Training

## Training datasets list

### Psychology and mental health datasets

#### Text datasets


* Kaggle Psychometrics dataset https://www.kaggle.com/discussions/general/304994
* Psychometric tests dataset https://ieee-dataport.org/documents/psychometric-tests-dataset
* Psychometric NLP https://paperswithcode.com/dataset/psychometric-nlp
* Reddit mental health dataset https://zenodo.org/record/3941387
* Reddit mental disorders identification https://www.kaggle.com/datasets/kamaruladha/mental-disorders-identification-reddit-nlp
* Kaggle Mental Health Conversational Data https://www.kaggle.com/datasets/elvis23/mental-health-conversational-data
* Kaggle Mental Health FAQ for Chatbot https://www.kaggle.com/narendrageek/mental-health-faq-for-chatbot/code
* A human consciousness questionnaire dataset https://data.mendeley.com/datasets/69p62ksdh6
* paperswithcode Self-reported Mental Health Diagnoses https://paperswithcode.com/dataset/smhd
* paperswithcode Mental Health Summarization Dataset https://paperswithcode.com/dataset/mentsum
* HuggingFace psychology dataset https://huggingface.co/datasets/samhog/psychology-10k

#### Text2Text datasets
* Kaggle Depression data for chatbot https://www.kaggle.com/datasets/nupurgopali/depression-data-for-chatbot

#### Classification datasets
* Classification for mental health https://www.kaggle.com/datasets/reihanenamdari/mental-health-corpus
* Depression identification https://www.kaggle.com/datasets/infamouscoder/depression-reddit-cleaned

### Philosophy datasets
* https://www.kaggle.com/datasets/christopherlemke/philosophical-texts
* https://www.workwithdata.com/object/philosophy-science-complete-a-text-on-traditional-problems-schools-thought-book-by-edwin-h-c-hung-0000
* https://www.kaggle.com/datasets/christopherlemke/philosophy-authors-writings-german
* https://www.workwithdata.com/object/philosophical-inquiries-an-introduction-to-problems-philosophy-book-by-nicholas-rescher-0000
* https://www.workwithdata.com/object/roman-stoicism-book-by-edward-vernon-arnold-1857
* https://www.workwithdata.com/object/wisdom-energy-basic-buddhist-teachings-book-by-thubten-yeshe-1935

## Training dataset creation

#### Data load and utility methods

In [None]:
# Mount Google Drive so the dataset file stored there is reachable from the runtime.
drive.mount('/content/drive')

# Location of the raw conversation file; change this if the data lives elsewhere.
DEPRESSION_DATA_PATH = '/content/drive/MyDrive/Data/depression.yml'

# Decode explicitly as UTF-8 instead of relying on the platform default, and parse with
# safe_load so the YAML file cannot construct arbitrary Python objects.
with open(DEPRESSION_DATA_PATH, 'r', encoding='utf-8') as file:
  depression_data = yaml.safe_load(file)

In [None]:
def parse_depression_dataset(conversations):
  """Turn raw conversations into parallel instruction/response lists.

  Each conversation is a sequence of strings: the first entry is used as the
  instruction and all remaining entries are joined with single spaces to form
  the response. Non-breaking spaces (``\\xa0``) are replaced by regular spaces
  and the response is stripped of surrounding whitespace.

  Empty conversations are skipped: they carry no prompt, and would otherwise
  either raise ``NameError`` (when first) or silently reuse the previous
  conversation's prompt with an empty response.

  Args:
    conversations: Iterable of conversations, each a sequence of strings.

  Returns:
    dict with keys ``'instruction'`` and ``'response'``, each a list with one
    entry per non-empty conversation.
  """
  output = {'instruction': [], 'response': []}
  for convo in conversations:
    if not convo:
      continue
    prompt, *replies = convo
    output['instruction'].append(prompt.replace("\xa0", " "))
    output['response'].append(" ".join(replies).replace("\xa0", " ").strip())
  return output

In [None]:
def formatting_func(example):
  """Render one example into the instruction-tuning prompt used for training.

  Args:
    example: Mapping with ``'instruction'`` and ``'response'`` keys and an
      optional ``'context'`` key. A missing, ``None`` or empty context selects
      the template without an ``### Input:`` section.

  Returns:
    dict with a single ``'text'`` key holding the full prompt, response included.
  """
  # Truthiness check so that a None context is not rendered as the literal text "None".
  context = example.get("context")

  # Section headers carry no trailing space in either branch, so the "### Response:" marker
  # is identical whether or not a context is present.
  if context:
    input_prompt = (
      "Below is an instruction that describes a task, paired with an input that provides further context. "
      "Write a response that appropriately completes the request.\n\n"
      "### Instruction:\n"
      f"{example['instruction']}\n\n"
      "### Input:\n"
      f"{context}\n\n"
      "### Response:\n"
      f"{example['response']}"
    )
  else:
    input_prompt = (
      "Below is an instruction that describes a task. "
      "Write a response that appropriately completes the request.\n\n"
      "### Instruction:\n"
      f"{example['instruction']}\n\n"
      "### Response:\n"
      f"{example['response']}"
    )

  return {"text": input_prompt}

In [None]:
# Convert the raw conversations into instruction/response columns.
parsed_depression_data = parse_depression_dataset(depression_data['conversations'])
# Hold out 10% for evaluation; the fixed seed keeps the split identical across runs.
depression_dataset = Dataset.from_dict(parsed_depression_data).train_test_split(test_size=0.1, seed=42)
# Add the "text" column holding the full training prompt for every example.
formatted_depression_dataset = depression_dataset.map(formatting_func)

## Training phase

#### Training utility methods

In [None]:
def bleu_rouge_f1(eval_pred):
  """Compute BLEU, ROUGE and their harmonic mean for an evaluation pass.

  Args:
    eval_pred: Pair ``(predictions, labels)``. ``predictions`` is reduced with
      ``argmax`` over its last axis to obtain token ids, so it is assumed to
      hold per-position vocabulary scores of a causal LM (TODO confirm for the
      trainer in use). ``labels`` holds token ids, with ``-100`` marking
      positions that must be ignored.

  Returns:
    dict with keys ``bleu``, ``rouge1``, ``rouge2``, ``rougeL`` and ``f1``,
    where ``f1`` is the harmonic mean of ``bleu`` and ``rouge1`` (``0.0`` when
    both are zero).
  """
  logits, labels = eval_pred
  predicted_ids = np.argmax(logits, axis=-1)

  decoded_predictions = []
  decoded_labels = []
  for predicted_row, label_row in zip(predicted_ids, labels):
    # A causal LM's output at position i scores token i + 1, so drop the last prediction
    # and the first label to line the two sequences up.
    predicted_row = np.asarray(predicted_row)[:-1]
    label_row = np.asarray(label_row)[1:]
    # Apply the same -100 mask to both sides; previously only the labels were filtered, so
    # predictions at ignored (e.g. padded) positions leaked into the decoded text.
    keep = label_row != -100
    decoded_predictions.append(tokenizer.decode(predicted_row[keep].tolist()))
    decoded_labels.append(tokenizer.decode(label_row[keep].tolist()))

  bleu_results = load("bleu").compute(predictions=decoded_predictions, references=decoded_labels)
  rouge_results = load('rouge').compute(predictions=decoded_predictions, references=decoded_labels)

  bleu_score = bleu_results['bleu']
  rouge1_score = rouge_results['rouge1']
  # Guard the harmonic mean against a zero denominator when both scores are 0.
  denominator = bleu_score + rouge1_score
  f1 = 2 * (bleu_score * rouge1_score) / denominator if denominator else 0.0

  return {
      "bleu": bleu_score,
      "rouge1": rouge1_score,
      "rouge2": rouge_results["rouge2"],
      "rougeL": rouge_results["rougeL"],
      "f1": f1,
  }


def _build_sft_trainer(model, dataset, compute_metrics=None, data_collator=None):
  """Create the QLoRA ``SFTTrainer`` shared by training and baseline evaluation.

  Args:
    model: Model handed to ``SFTTrainer``.
    dataset: Mapping with ``"train"`` and ``"test"`` splits whose examples
      carry a ``"text"`` column (``dataset_text_field="text"``).
    compute_metrics: Optional metric callback forwarded to the trainer.
    data_collator: Optional collator forwarded to the trainer; ``None`` leaves
      the choice to ``SFTTrainer``.

  Returns:
    The configured ``SFTTrainer``.
  """
  qlora_config = LoraConfig(
      r=16,
      lora_alpha=32,
      lora_dropout=0.05,
      bias="none",
      task_type="CAUSAL_LM",
  )

  # TrainingArguments is imported by name at the top of the notebook; the bare `transformers`
  # module is never imported, so `transformers.TrainingArguments` raised NameError.
  # NOTE(review): fp16=True is combined with a bfloat16 bnb compute dtype in the model-loading
  # cell — confirm this mix is intended for the target GPU.
  training_args = TrainingArguments(
      per_device_train_batch_size=1,
      gradient_accumulation_steps=4,
      learning_rate=2e-4,
      max_steps=5000,
      output_dir="./wisai",
      optim="paged_adamw_8bit",
      fp16=True,
  )

  return SFTTrainer(
      model,
      train_dataset=dataset["train"],
      eval_dataset=dataset["test"],
      args=training_args,
      tokenizer=tokenizer,
      peft_config=qlora_config,
      dataset_text_field="text",
      max_seq_length=512,
      compute_metrics=compute_metrics,
      data_collator=data_collator,
  )


def train_model(model, formatted_dataset, metric):
  """Fine-tune ``model`` with QLoRA and return its evaluation results.

  Args:
    model: Model to fine-tune (previously ignored in favour of the global ``base_model``).
    formatted_dataset: Mapping with ``"train"`` and ``"test"`` splits that
      contain a ``"text"`` column.
    metric: ``compute_metrics`` callback used during evaluation (previously
      accepted but never passed to the trainer).

  Returns:
    The dict returned by the trainer's ``evaluate()`` after training.
  """
  supervised_finetuning_trainer = _build_sft_trainer(model, formatted_dataset, compute_metrics=metric)

  supervised_finetuning_trainer.train()

  return supervised_finetuning_trainer.evaluate()


def pretraining_prediction_scores(model, tokenized_dataset, data_collator, metric):
  """Evaluate ``model`` without training it, as a baseline for comparison.

  Args:
    model: Model to evaluate (previously ignored in favour of the global ``base_model``).
    tokenized_dataset: Despite its name, a mapping with ``"train"`` and
      ``"test"`` splits that contain a ``"text"`` column; the body previously
      read an undefined ``formatted_dataset`` instead of this parameter.
    data_collator: Collator forwarded to the trainer (previously ignored);
      pass ``None`` to keep the trainer's default.
    metric: ``compute_metrics`` callback used during evaluation.

  Returns:
    The dict returned by the trainer's ``evaluate()``.
  """
  # NOTE(review): the trainer is still built with a LoRA config, so adapters are presumably
  # attached to `model` even though no training step runs — confirm this does not interfere
  # with a later train_model() call on the same object.
  supervised_finetuning_trainer = _build_sft_trainer(
      model,
      tokenized_dataset,
      compute_metrics=metric,
      data_collator=data_collator,
  )

  return supervised_finetuning_trainer.evaluate()

#### Training


#### Experiments
1. Compare trained / untrained / small model results
2. Complete training on all datasets



In [None]:
# scores_pretrained_model = pretraining_prediction_scores(reference_base_model, tokenized_dataset, data_collator_seq2seq, bleu_rouge_f1)
# scores_pretrained_model

In [None]:
# Fine-tune on the formatted depression dataset and show the evaluation metrics.
# train_model takes (model, formatted_dataset, metric); the previous call passed a fourth
# argument and referenced names that are not defined anywhere in the notebook.
scores = train_model(base_model, formatted_depression_dataset, bleu_rouge_f1)
scores

# Chatbot launch

In [None]:
# Sampling settings used for every generation request.
gen_config = GenerationConfig(
    do_sample=True,
    temperature=0.9,
    max_new_tokens=150,
    pad_token_id=tokenizer.eos_token_id,
    num_return_sequences=1
)

# Mirrors the context-free prompt built by formatting_func, stopping right before the response,
# so inference sees the same layout the model was fine-tuned on.
PROMPT_TEMPLATE = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n"
    "{instruction}\n\n"
    "### Response:\n"
)

def predict(prompt, apply_template=True):
    """Generate a reply for ``prompt`` and return only the newly generated text.

    Args:
        prompt: User question or instruction.
        apply_template: When True (default) the prompt is wrapped in the
            instruction template used for fine-tuning; pass False to send the
            raw text to the model.

    Returns:
        The decoded continuation, without the prompt and without special tokens.
    """
    text = PROMPT_TEMPLATE.format(instruction=prompt) if apply_template else prompt
    # `model` was never defined in this notebook; use the loaded `base_model`.
    # NOTE(review): presumably the LoRA adapters attached during training live on this same
    # object — verify, otherwise generate from the trainer's model instead.
    # Inputs are moved to the model's device; they were previously left on the CPU.
    encoded_input = tokenizer(text, return_tensors='pt').to(base_model.device)
    input_length = encoded_input["input_ids"].shape[1]
    output_ids = base_model.generate(generation_config=gen_config, **encoded_input)[0]
    # Slice off the prompt tokens so only the generated continuation is decoded.
    return tokenizer.decode(output_ids[input_length:], skip_special_tokens=True)

#gr.Interface(fn=predict, inputs="text", outputs="text").launch()
print(predict("What is Depression?"))

# Saving model components to Huggingface

In [None]:
# Never hardcode the Hugging Face token in the notebook: read it from the environment.
# Any token that was previously committed here should be considered leaked and revoked.
token = os.environ["HF_TOKEN"]

# `model` was never defined in this notebook; push the loaded `base_model`.
# NOTE(review): this presumably uploads the quantized base weights together with any attached
# adapters — confirm that is intended rather than pushing only the LoRA adapter.
base_model.push_to_hub("wisai", token=token)
# gen_config.push_to_hub("wisai", "generation_config.json", token=token)