<a href="https://colab.research.google.com/github/vitaliy-sharandin/data_science_projects/blob/master/nlp/gpt/wisai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WisAI
### WisAI model is a GPT-NeoX-20B model fine-tuned on philosophical and psychological data and configured to provide useful advice. (Note: the code in this notebook loads the smaller `EleutherAI/gpt-neo-125M` checkpoint.)

In [1]:
!pip install gradio
!pip install transformers
!pip install datasets
!pip install --upgrade accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gradio
  Downloading gradio-3.32.0-py3-none-any.whl (19.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.9/19.9 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles (from gradio)
  Downloading aiofiles-23.1.0-py3-none-any.whl (14 kB)
Collecting aiohttp (from gradio)
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m69.2 MB/s[0m eta [36m0:00:00[0m
Collecting fastapi (from gradio)
  Downloading fastapi-0.95.2-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.0/57.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.0.tar.gz (4.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client>

In [2]:
import json

import gradio as gr
import pandas as pd
import torch
import yaml
from datasets import Dataset, concatenate_datasets
from google.colab import drive
from torch.utils.data import random_split
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
    GenerationConfig,
    Trainer,
    TrainingArguments,
)

In [54]:
# Load the pretrained tokenizer and register a dedicated padding token
# (needed to pad variable-length examples into batches).
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")
# Adding '[PAD]' grew the tokenizer vocabulary by one entry. The embedding
# matrix must grow with it, otherwise the new pad id indexes past the end of
# the embedding table as soon as a padded batch reaches the model.
model.resize_token_embeddings(len(tokenizer))

# Fix the torch RNG so training and sampling are repeatable.
torch.manual_seed(42)

<torch._C.Generator at 0x7eff8ba058b0>

# Training

## Training datasets list

### Psychology and mental health datasets

#### Text datasets


* Kaggle Psychometrics dataset https://www.kaggle.com/discussions/general/304994
* Psychometric tests dataset https://ieee-dataport.org/documents/psychometric-tests-dataset
* Psychometric NLP https://paperswithcode.com/dataset/psychometric-nlp
* Reddit mental health dataset https://zenodo.org/record/3941387
* Reddit mental disorders identification https://www.kaggle.com/datasets/kamaruladha/mental-disorders-identification-reddit-nlp
* Kaggle Mental Health Conversational Data https://www.kaggle.com/datasets/elvis23/mental-health-conversational-data
* Kaggle Mental Health FAQ for Chatbot https://www.kaggle.com/narendrageek/mental-health-faq-for-chatbot/code
* A human consciousness questionnaire dataset https://data.mendeley.com/datasets/69p62ksdh6
* paperswithcode Self-reported Mental Health Diagnoses https://paperswithcode.com/dataset/smhd
* paperswithcode Mental Health Summarization Dataset https://paperswithcode.com/dataset/mentsum
* HuggingFace psychology dataset https://huggingface.co/datasets/samhog/psychology-10k

#### Text2Text datasets 
* Kaggle Depression data for chatbot https://www.kaggle.com/datasets/nupurgopali/depression-data-for-chatbot

#### Classification datasets
* Classification for mental health https://www.kaggle.com/datasets/reihanenamdari/mental-health-corpus
* Depression identification https://www.kaggle.com/datasets/infamouscoder/depression-reddit-cleaned

### Philosophy datasets
* https://www.kaggle.com/datasets/christopherlemke/philosophical-texts
* https://www.workwithdata.com/object/philosophy-science-complete-a-text-on-traditional-problems-schools-thought-book-by-edwin-h-c-hung-0000
* https://www.kaggle.com/datasets/christopherlemke/philosophy-authors-writings-german
* https://www.workwithdata.com/object/philosophical-inquiries-an-introduction-to-problems-philosophy-book-by-nicholas-rescher-0000
* https://www.workwithdata.com/object/roman-stoicism-book-by-edward-vernon-arnold-1857
* https://www.workwithdata.com/object/wisdom-energy-basic-buddhist-teachings-book-by-thubten-yeshe-1935

## Training dataset creation

#### Kaggle depression dataset

In [4]:
# Mount Google Drive so the dataset file stored there is readable from the Colab VM.
drive.mount('/content/drive')

# Read the YAML corpus. The encoding is stated explicitly instead of relying on
# the platform default, and safe_load is used so that no arbitrary Python
# objects can be constructed from the file.
with open('/content/drive/MyDrive/Data/depression.yml', 'r', encoding='utf-8') as file:
  depression_data = yaml.safe_load(file)

Mounted at /content/drive


In [5]:
def parse_depression_dataset(conversations):
  """Convert a list of conversations into parallel prompt/completion lists.

  Each conversation is a sequence of utterance strings. The first utterance
  becomes the prompt; all remaining utterances are joined with single spaces
  to form the completion. Non-breaking spaces (``\\xa0``) are replaced with
  ordinary spaces in both, and the completion is stripped of surrounding
  whitespace.

  Args:
    conversations: iterable of sequences of strings (``None`` is treated as
      empty).

  Returns:
    ``{'prompt': [...], 'completion': [...]}`` with one entry per non-empty
    conversation. A conversation that has only a prompt yields an empty
    completion string.
  """
  output = {'prompt': [], 'completion': []}
  for convo in conversations or []:
    if not convo:
      # An empty conversation has no prompt. Skip it rather than reusing the
      # prompt left over from the previous conversation (or failing outright
      # when it is the first one).
      continue
    prompt, *replies = convo
    output['prompt'].append(prompt.replace("\xa0", " "))
    output['completion'].append(" ".join(replies).replace("\xa0", " ").strip())
  return output

In [59]:
# Flatten the raw conversations into prompt/completion columns.
parsed_depression_data = parse_depression_dataset(depression_data['conversations'])
# DataFrame view of the same data, kept for interactive inspection.
depression_df = pd.DataFrame(parsed_depression_data)
# Build the Hugging Face dataset from the plain dict of lists, which is the
# input Dataset.from_dict is meant for, instead of passing a DataFrame through it.
depression_dataset = Dataset.from_dict(parsed_depression_data)

def tokenize_prompt_completion_element_no_overflow(element):
  """Tokenize one prompt/completion pair for causal-LM fine-tuning.

  The training text is ``<bos>{prompt}\\n{completion}<eos>``. No truncation is
  applied, so the whole example is kept as a single sequence.

  Args:
    element: mapping with string values under ``"prompt"`` and
      ``"completion"``.

  Returns:
    dict with
      * ``"input_ids"``: token ids of the full training text;
      * ``"labels"``: same length as ``input_ids``, with the prompt positions
        set to -100 (the value the collator also uses for label padding) and
        the remaining positions equal to ``input_ids``.
  """
  prompt = element["prompt"]
  completion = element["completion"]

  prompt_string = f"{tokenizer.bos_token}{prompt}\n"
  prompt_completion_string = f"{prompt_string}{completion}{tokenizer.eos_token}"

  input_ids = tokenizer(prompt_completion_string)["input_ids"]
  # Only the *length* of the tokenized prompt is needed to know how many
  # leading positions to mask; clamp it so it can never exceed the sequence.
  prompt_length = min(len(tokenizer(prompt_string)["input_ids"]), len(input_ids))

  # Derive labels from input_ids themselves. Tokenizing the completion on its
  # own and concatenating can yield a different number of tokens than the
  # jointly tokenized text, which leaves labels and input_ids misaligned.
  labels = [-100] * prompt_length + input_ids[prompt_length:]

  return {"input_ids": input_ids, "labels": labels}

def tokenize_dataset_no_overflow(dataset, test_size=0.2, seed=42):
  """Tokenize every example of ``dataset`` and split it into train/test.

  Args:
    dataset: dataset exposing ``column_names``, ``map`` and
      ``train_test_split`` (a Hugging Face ``Dataset`` in this notebook).
    test_size: fraction (or count) of examples placed in the test split.
    seed: seed for the shuffle performed by the split, so the same examples
      land in each split on every run.

  Returns:
    Whatever ``train_test_split`` returns: a mapping with ``"train"`` and
    ``"test"`` entries holding the tokenized examples.
  """
  # The original text columns are dropped; only the tokenizer outputs remain.
  tokenized = dataset.map(
      tokenize_prompt_completion_element_no_overflow,
      remove_columns=dataset.column_names,
  )
  return tokenized.train_test_split(test_size=test_size, seed=seed)

# Tokenize the corpus and split it into train/test partitions.
tokenized_dataset = tokenize_dataset_no_overflow(depression_dataset)

# Collator that pads input_ids and labels of each batch to a common length.
# (DataCollatorForSeq2Seq is imported with the other dependencies at the top.)
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model
)

# data_collator([tokenized_dataset["train"][i] for i in range(5)])



Map:   0%|          | 0/51 [00:00<?, ? examples/s]

In [None]:
def tokenize_prompt_completion_element_overflow(element):
  """Debug helper (work in progress) for windowed tokenization.

  Tokenizes the full ``<bos>{prompt}\\n{completion}<eos>`` text, the prompt
  part and the completion part, each with truncation and overlapping overflow
  windows (window length 4, stride 2). It then prints the decoded windows of
  all three encodings followed by the raw encoding of the full text.

  Nothing is returned yet.

  TODO: build ``input_ids``/``labels`` from the windows. Open question from
  the original notes: each encoding is a list of windows (list of lists), so
  the number of prompt tokens falling into a given window is not directly
  known, which is what is needed to mask the prompt part with -100.
  """
  bos, eos = tokenizer.bos_token, tokenizer.eos_token
  prompt_text = f"{bos}{element['prompt']}\n"
  completion_text = f"{element['completion']}{eos}"
  full_text = prompt_text + completion_text

  # Identical windowing options for all three tokenizer calls.
  window_kwargs = dict(
      truncation=True,
      return_overflowing_tokens=True,
      return_length=True,
      max_length=4,
      stride=2,
  )
  encodings = [
      tokenizer(text, **window_kwargs)
      for text in (full_text, prompt_text, completion_text)
  ]

  # One decoded window per line: full text first, then prompt, then completion.
  for encoding in encodings:
    print('\n'.join(tokenizer.decode(window) for window in encoding["input_ids"]))

  # Raw encoding of the full text.
  print(encodings[0])

## Training phase

In [60]:
# Hyper-parameters and bookkeeping options handed to the Trainer.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    overwrite_output_dir=True,
    # Optimisation schedule.
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Checkpoint / logging cadence, in optimisation steps.
    save_steps=500,
    logging_steps=100,
    # Evaluation settings.
    do_eval=True,
    evaluation_strategy="epoch",
)

# Custom training function
def train_model(model, training_args):
    """Fine-tune ``model`` and return its final evaluation loss.

    Uses the notebook-level ``tokenizer``, ``data_collator`` and
    ``tokenized_dataset`` (its ``'train'`` and ``'test'`` splits).

    Args:
        model: the model to fine-tune (updated in place by the trainer).
        training_args: ``TrainingArguments`` controlling the run.

    Returns:
        The ``"eval_loss"`` value reported by ``trainer.evaluate()`` after
        training has finished.
    """
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        data_collator=data_collator,
        train_dataset=tokenized_dataset['train'],
        eval_dataset=tokenized_dataset['test']
    )

    # No compute_metrics hook is installed: the object handed to such a hook
    # carries predictions and labels but no `.loss`, so reading it fails during
    # evaluation. The trainer reports "eval_loss" on its own.

    # Train the model
    trainer.train()

    # Evaluate on the held-out split and return the loss
    eval_result = trainer.evaluate()
    return eval_result["eval_loss"]

# Run fine-tuning; the final evaluation loss is shown as the cell output.
eval_loss = train_model(model=model, training_args=training_args)
eval_loss



# Chatbot launch

In [None]:
# Decoding settings used by `predict`.
gen_config = GenerationConfig(
    # Length / count of generated continuations.
    max_new_tokens=150,
    num_return_sequences=1,
    # Stochastic decoding.
    do_sample=True,
    temperature=0.9,
    # Generation pads with the EOS id rather than the added '[PAD]' token.
    pad_token_id=tokenizer.eos_token_id,
)

def predict(prompt):
    """Generate a continuation of ``prompt`` with the notebook-level model.

    Args:
        prompt: input text.

    Returns:
        The decoded text of the newly generated tokens only (the echoed prompt
        tokens are cut off), with special tokens removed.
    """
    # Inference must not run with training-time behaviour such as dropout.
    model.eval()
    # Put the inputs on the same device as the model; after training the model
    # may live on an accelerator while freshly tokenized tensors are on the CPU.
    encoded_input = tokenizer(prompt, return_tensors='pt').to(model.device)
    input_length = len(encoded_input["input_ids"][0])
    output_ids = model.generate(generation_config=gen_config, **encoded_input)[0]
    # Everything before input_length is the prompt itself.
    return tokenizer.decode(output_ids[input_length:], skip_special_tokens=True)

# Optional web UI: uncomment to serve `predict` through a Gradio text-in/text-out interface.
#gr.Interface(fn=predict, inputs="text", outputs="text").launch()
# Quick smoke test of the generation pipeline with a fixed prompt.
print(predict("Hello, AI."))

# Saving model components to Huggingface

In [None]:
# SECURITY: never store an access token in the notebook. Read it at runtime
# instead, e.g. token = os.environ["HF_TOKEN"] or token = getpass.getpass(),
# and revoke any token that was previously committed in this cell.
# model.push_to_hub("wisai", use_auth_token=token)
# gen_config.push_to_hub("wisai", "generation_config.json", use_auth_token=token)