<a href="https://colab.research.google.com/github/vitaliy-sharandin/data_science_projects/blob/master/nlp/gpt/wisai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WisAI
### WisAI is a GPT-Neo language model (this notebook loads the `EleutherAI/gpt-neo-125M` checkpoint rather than GPT-NeoX-20B), fine-tuned on philosophical and psychological data and configured to provide useful advice.

In [1]:
!pip install gradio
!pip install transformers
!pip install datasets
!pip install --upgrade accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
import pandas as pd
import json
import yaml
import gradio as gr
import torch
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, GenerationConfig, Trainer, TrainingArguments
from datasets import Dataset, concatenate_datasets
from torch.utils.data import random_split

In [3]:
# Single source of truth for the checkpoint so tokenizer and model always match.
PRETRAINED_CHECKPOINT = "EleutherAI/gpt-neo-125M"

tokenizer = GPT2Tokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = GPTNeoForCausalLM.from_pretrained(PRETRAINED_CHECKPOINT)

# Seed PyTorch's global RNG so later random operations are repeatable.
torch.manual_seed(42)

<torch._C.Generator at 0x7fd28c47a4f0>

# Training

## Training datasets list

### Psychology and mental health datasets

#### Text datasets


* Kaggle Psychometrics dataset https://www.kaggle.com/discussions/general/304994
* Psychometric tests dataset https://ieee-dataport.org/documents/psychometric-tests-dataset
* Psychometric NLP https://paperswithcode.com/dataset/psychometric-nlp
* Reddit mental health dataset https://zenodo.org/record/3941387
* Reddit mental disorders identification https://www.kaggle.com/datasets/kamaruladha/mental-disorders-identification-reddit-nlp
* Kaggle Mental Health Conversational Data https://www.kaggle.com/datasets/elvis23/mental-health-conversational-data
* Kaggle Mental Health FAQ for Chatbot https://www.kaggle.com/narendrageek/mental-health-faq-for-chatbot/code
* A human consciousness questionnaire dataset https://data.mendeley.com/datasets/69p62ksdh6
* paperswithcode Self-reported Mental Health Diagnoses https://paperswithcode.com/dataset/smhd
* paperswithcode Mental Health Summarization Dataset https://paperswithcode.com/dataset/mentsum
* HuggingFace psychology dataset https://huggingface.co/datasets/samhog/psychology-10k

#### Text2Text datasets 
* Kaggle Depression data for chatbot https://www.kaggle.com/datasets/nupurgopali/depression-data-for-chatbot

#### Classification datasets
* Classification for mental health https://www.kaggle.com/datasets/reihanenamdari/mental-health-corpus
* Depression identification https://www.kaggle.com/datasets/infamouscoder/depression-reddit-cleaned

### Philosophy datasets
* https://www.kaggle.com/datasets/christopherlemke/philosophical-texts
* https://www.workwithdata.com/object/philosophy-science-complete-a-text-on-traditional-problems-schools-thought-book-by-edwin-h-c-hung-0000
* https://www.kaggle.com/datasets/christopherlemke/philosophy-authors-writings-german
* https://www.workwithdata.com/object/philosophical-inquiries-an-introduction-to-problems-philosophy-book-by-nicholas-rescher-0000
* https://www.workwithdata.com/object/roman-stoicism-book-by-edward-vernon-arnold-1857
* https://www.workwithdata.com/object/wisdom-energy-basic-buddhist-teachings-book-by-thubten-yeshe-1935

## Training dataset creation

#### Kaggle depression dataset

In [4]:
drive.mount('/content/drive')

# Location of the depression conversation corpus on the mounted Drive.
DEPRESSION_DATA_PATH = '/content/drive/MyDrive/Data/depression.yml'

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding: the parser in this notebook replaces "\xa0" characters, so the
# corpus presumably contains non-ASCII text.
with open(DEPRESSION_DATA_PATH, 'r', encoding='utf-8') as file:
    depression_data = yaml.safe_load(file)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
def parse_depression_dataset(conversations):
    """Convert multi-turn conversations into prompt/completion pairs.

    The first utterance of each conversation becomes the prompt. All remaining
    utterances are joined with single spaces and stripped to form the
    completion. Non-breaking spaces (U+00A0) are replaced by regular spaces in
    both fields.

    Empty conversations are skipped. Previously an empty conversation either
    raised ``UnboundLocalError`` (when it came first) or silently re-used the
    prompt of the preceding conversation with an empty completion.

    Args:
        conversations: Iterable of conversations, each a sequence of utterance
            strings. ``None`` is treated as no conversations.

    Returns:
        dict with two parallel lists under the keys ``'prompt'`` and
        ``'completion'``.
    """
    output = {'prompt': [], 'completion': []}
    for convo in conversations or []:
        if not convo:
            # Nothing to learn from a conversation without a single utterance.
            continue
        prompt, *replies = convo
        prompt = prompt.replace("\xa0", " ")
        completion = " ".join(replies).replace("\xa0", " ").strip()
        output['prompt'].append(prompt)
        output['completion'].append(completion)
    return output

In [9]:
# Flatten the raw YAML conversations into prompt/completion columns.
raw_conversations = depression_data['conversations']
parsed_depression_data = parse_depression_dataset(raw_conversations)
depression_df = pd.DataFrame(data=parsed_depression_data)

def tokenize_text2text_df(row, max_length=128):
    """Tokenize one prompt/completion row into fixed-length causal-LM features.

    The prompt and the completion are tokenized separately (each truncated to
    at most ``max_length`` tokens) and concatenated. An end-of-sequence token
    is appended to the completion so the model can learn where an answer ends.
    Every row is then right-padded to ``2 * max_length`` tokens so that all
    examples have the same length and can be stacked into a batch without a
    padding collator.

    Loss masking: prompt positions and padding positions carry the label
    ``-100`` so only completion tokens (including the EOS) contribute to the
    loss.

    Args:
        row: Mapping/Series with string fields ``"prompt"`` and ``"completion"``.
        max_length: Maximum number of tokens kept for the prompt and for the
            completion (EOS included), individually.

    Returns:
        The same ``row`` with ``"input_ids"``, ``"attention_mask"`` and
        ``"labels"`` added, each a list of length ``2 * max_length``.
    """
    prompt_ids = tokenizer(row["prompt"],
                           truncation=True,
                           max_length=max_length)["input_ids"]

    # Reserve one slot for the EOS token appended below.
    completion_ids = tokenizer(row["completion"],
                               truncation=True,
                               max_length=max_length - 1)["input_ids"]
    completion_ids = completion_ids + [tokenizer.eos_token_id]

    input_ids = prompt_ids + completion_ids
    labels = [-100] * len(prompt_ids) + completion_ids

    # The EOS id is used purely as filler here; the attention mask and the
    # -100 labels keep padded positions out of attention and out of the loss.
    pad_count = 2 * max_length - len(input_ids)

    row["input_ids"] = input_ids + [tokenizer.eos_token_id] * pad_count
    row["attention_mask"] = [1] * len(input_ids) + [0] * pad_count
    row["labels"] = labels + [-100] * pad_count

    return row

# Tokenize every row, then keep only the model-ready columns
tokenized_df = (
    depression_df
    .apply(tokenize_text2text_df, axis=1)
    .drop(columns=['prompt', 'completion'])
)

# Wrap the tokenized frame in a Hugging Face Dataset
depression_dataset = Dataset.from_pandas(tokenized_df)

# 80/20 train/test split
n_examples = len(depression_dataset)
train_size = int(0.8 * n_examples)
test_size = n_examples - train_size
train_dataset, test_dataset = random_split(depression_dataset, [train_size, test_size])

## Training phase

In [10]:
# Hyper-parameters and bookkeeping options handed to the Hugging Face Trainer.
training_config = {
    # Where checkpoints and logs are written
    "output_dir": "./results",
    "overwrite_output_dir": True,
    "logging_dir": "./logs",
    # Optimisation schedule
    "num_train_epochs": 3,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    # Checkpointing / logging / evaluation cadence
    "save_steps": 500,
    "logging_steps": 100,
    "evaluation_strategy": "epoch",
    "do_eval": True,
}
training_args = TrainingArguments(**training_config)

# Custom training function
def train_model(model, training_args, train_data=None, eval_data=None):
    """Fine-tune ``model`` with the Hugging Face ``Trainer`` and return its eval loss.

    Args:
        model: The causal language model to fine-tune (updated in place).
        training_args: ``TrainingArguments`` controlling the run.
        train_data: Training dataset. Defaults to the notebook-level
            ``train_dataset`` when omitted.
        eval_data: Evaluation dataset. Defaults to the notebook-level
            ``test_dataset`` when omitted.

    Returns:
        The ``"eval_loss"`` value reported by ``trainer.evaluate()`` after
        training has finished.
    """
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if train_data is None else train_data,
        eval_dataset=test_dataset if eval_data is None else eval_data,
    )

    # No compute_metrics hook is installed. The previous hook read
    # `eval_pred.loss`, but the EvalPrediction object passed to the hook
    # carries predictions and label ids, not a loss. Installing any hook also
    # makes the Trainer collect the full logits of the evaluation set.
    # The evaluation loss is already part of the evaluate() result.

    # Train the model
    trainer.train()

    # Evaluate the model and report the evaluation loss
    eval_result = trainer.evaluate()
    return eval_result["eval_loss"]

# Training the model
eval_loss = train_model(model, training_args)




# Chatbot launch

In [None]:
# Sampling-based decoding settings shared by every call to model.generate().
generation_settings = dict(
    do_sample=True,
    temperature=0.9,
    max_new_tokens=150,
    num_return_sequences=1,
    # Reuse the tokenizer's EOS id as the padding id during generation.
    pad_token_id=tokenizer.eos_token_id,
)
gen_config = GenerationConfig(**generation_settings)

def predict(prompt):
    """Generate a reply to ``prompt`` with the fine-tuned model.

    Args:
        prompt: User text to respond to.

    Returns:
        The newly generated text only (the echoed prompt tokens are removed),
        decoded without special tokens.
    """
    # Move the encoded inputs to the model's device: after training, the
    # Trainer may have placed the model on a GPU while the tokenizer output
    # is created on the CPU, which makes generate() fail on a device mismatch.
    encoded_input = tokenizer(prompt, return_tensors='pt').to(model.device)
    input_length = encoded_input["input_ids"].shape[1]
    output_ids = model.generate(generation_config=gen_config, **encoded_input)[0]
    # Slice off the prompt so only the continuation is returned.
    return tokenizer.decode(output_ids[input_length:], skip_special_tokens=True)

#gr.Interface(fn=predict, inputs="text", outputs="text").launch()
print(predict("Hello, AI."))

# Saving model components to Huggingface

In [None]:
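# SECURITY: never store a Hugging Face access token in the notebook. The token
# that used to be written here is exposed and should be revoked; read a fresh
# one from the environment (or a secrets manager) instead.
# import os
# token = os.environ["HF_TOKEN"]
# model.push_to_hub("wisai", use_auth_token=token)
# gen_config.push_to_hub("wisai", "generation_config.json", use_auth_token=token)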