In [None]:
!pip install transformers
!pip install datasets
!pip install nltk
!pip install python-dotenv
!pip install scikit-learn
!pip install sacrebleu
!pip install rouge_score
!pip install accelerate
!pip install torch
!pip install h5py
!pip install datasketch
!pip install revChatGPT
!pip install evaluate
!pip install openai

In [108]:
import json
import os
import random
def concatenate_json_lists(directory_path, target_length, output_file_path, seed=None):
    """Merge records from randomly ordered JSON list files into a single JSON file.

    Every ``*.json`` file in ``directory_path`` is a candidate. Files are
    visited in shuffled order and their top-level lists are appended until at
    least ``target_length`` records have been collected; the result is then
    truncated to ``target_length`` and written to ``output_file_path``.

    Args:
        directory_path: Directory containing the ``*.json`` input files.
        target_length: Maximum number of records to write.
        output_file_path: Destination of the merged JSON list.
        seed: Optional seed for the file shuffle. With the default ``None`` the
            global ``random`` state is used, exactly as before; passing a value
            makes the selection reproducible across runs.

    Returns:
        The number of records written (0 when no list data was found, in which
        case no output file is produced).
    """
    # os.listdir gives no ordering guarantee, so sort before shuffling;
    # otherwise the same seed could still produce a different sample.
    files_list = sorted(
        name for name in os.listdir(directory_path) if name.endswith(".json")
    )
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(files_list)

    concatenated_list = []
    for filename in files_list:
        file_path = os.path.join(directory_path, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        # Files whose top-level JSON value is not a list are skipped.
        if not isinstance(data, list):
            continue

        concatenated_list.extend(data)
        if len(concatenated_list) >= target_length:
            break

    if len(concatenated_list) == 0:
        print("No data to concatenate or target length not reached.")
        return 0

    if len(concatenated_list) < target_length:
        # Previously a shortfall was silent; surface it so a too-small split is noticed.
        print(
            f"Warning: only {len(concatenated_list)} records available, "
            f"fewer than the requested {target_length}."
        )

    concatenated_list = concatenated_list[:target_length]
    with open(output_file_path, "w", encoding="utf-8") as output_file:
        json.dump(concatenated_list, output_file, indent=2, ensure_ascii=False)
    print(f"Concatenated list written to {output_file_path}")
    return len(concatenated_list)

In [109]:
# Build the training split: at most 100000 records drawn from the train folder.
directory_path, target_length, output_file_path = (
    "/kaggle/input/french-conversational-dataset/train",
    100000,
    "/kaggle/working/train.json",
)
concatenate_json_lists(directory_path, target_length, output_file_path)

Concatenated list written to /kaggle/working/train.json


In [110]:
# Build the validation split; the target evaluates to 33333 records.
directory_path, target_length, output_file_path = (
    "/kaggle/input/french-conversational-dataset/val",
    int((1e5 / 6) * 2),
    "/kaggle/working/validation.json",
)
concatenate_json_lists(directory_path, target_length, output_file_path)

Concatenated list written to /kaggle/working/validation.json


In [111]:
# Build the test split; the target evaluates to 33333 records.
directory_path, target_length, output_file_path = (
    "/kaggle/input/french-conversational-dataset/test",
    int((1e5 / 6) * 2),
    "/kaggle/working/test.json",
)
concatenate_json_lists(directory_path, target_length, output_file_path)

Concatenated list written to /kaggle/working/test.json


In [122]:
# Create the model output directory named by "save_path" in config.json.
# -p keeps the cell re-runnable, and the absolute path does not depend on the current working directory.
!mkdir -p /kaggle/working/official_model

In [113]:
# NOTE(review): neither constant is referenced in the visible notebook cells;
# CONTEXT_LEN equals the "length" value (1600) written to config.json — TODO confirm
# whether these are still needed or should feed the config.
CONTEXT_LEN = 1600
NO_CONTEXT_LEN = 800

In [114]:
%%file /kaggle/working/config.json
{
"save_path": "/kaggle/working/official_model/",
"length": 1600,
"train":"/kaggle/working/train.json",
"dev":"/kaggle/working/validation.json",
"test":"/kaggle/working/test.json",
"batch_size": 4,
"batch_size_eval":16,
"epoch":10,
"lr":2e-5,
"weight_decay":1e-3,
"gpu":-1,
"gradient_accumulation_steps":32,
"optim":"adamw_torch"
}

Overwriting /kaggle/working/config.json


In [115]:
%%file /kaggle/working/train.py
from datasets.features.features import pa
import numpy as np
import os
import nltk
import argparse
from datasets import load_dataset
import accelerate
import evaluate
import torch
import json
nltk.download('punkt')
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments,TrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# Silence the tokenizers fork warning when DataLoader workers are used.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

parser = argparse.ArgumentParser(description='Fine-tune a seq2seq dialogue model from a JSON config file')
parser.add_argument('--config', type=str,default="",
                    help='config file path ')
args = parser.parse_args()

with open(args.config, "rb") as f:
  config = json.load(f)

# Hub checkpoint used when no previously saved model is found under save_path.
_BASE_MODEL = "microsoft/GODEL-v1_1-base-seq2seq"


def _find_saved_model(save_path):
  """Return the newest sub-directory of ``save_path`` holding a saved model, or None.

  A directory counts as a saved model when it contains ``config.json``. This
  skips unrelated entries in the output directory (TensorBoard ``runs/``,
  ``test_results.txt``) that a blind ``os.listdir(...)[0]`` could pick, and it
  tolerates a missing ``save_path``.
  """
  if not os.path.isdir(save_path):
    return None
  candidates = [os.path.join(save_path, name) for name in os.listdir(save_path)]
  candidates = [path for path in candidates
                if os.path.isfile(os.path.join(path, "config.json"))]
  if not candidates:
    return None
  # The most recently modified model directory is the latest training state.
  return max(candidates, key=os.path.getmtime)


# Resume from a previous run when one exists, otherwise start from the base model.
_model_source = _find_saved_model(config["save_path"]) or _BASE_MODEL
tokenizer = AutoTokenizer.from_pretrained(_model_source)
model = AutoModelForSeq2SeqLM.from_pretrained(_model_source)


def tokenize_function(examples):
    """Turn a batch of dialogue examples into tokenized model inputs with labels.

    Reads the ``context``, ``knowledge`` and ``response`` columns of
    ``examples``. Each prompt is an instruction followed by the dialogue turns
    joined with `` EOS ``; when the knowledge string is non-empty it is
    appended after a ``[KNOWLEDGE]`` marker and the knowledge-grounded
    instruction is used. Inputs and targets are both truncated to
    ``config["length"]`` tokens.

    Returns:
        The tokenizer output for the prompts, with the tokenized responses
        stored under ``labels``.
    """
    with_knowledge = 'Instruction: given a dialog context and related knowledge, you need to response safely based on the knowledge.'
    without_knowledge = 'Instruction: given a dialog context, you need to response empathically.'

    prompts = []
    for turns, facts in zip(examples["context"], examples["knowledge"]):
        history = ' EOS '.join(turns)
        if facts != "":
            prompts.append(f"{with_knowledge} [CONTEXT] {history} [KNOWLEDGE] {facts}")
        else:
            prompts.append(f"{without_knowledge} [CONTEXT] {history}")

    encoded = tokenizer(prompts, max_length=config["length"], truncation=True)

    # Tokenize the responses as targets and attach their ids as labels.
    answers = list(examples["response"])
    encoded_answers = tokenizer(text_target=answers, max_length=config["length"], truncation=True)
    encoded["labels"] = encoded_answers["input_ids"]

    return encoded

class f1:
  """Smoothed unigram-overlap F1 between predictions and their first reference."""

  @staticmethod
  def _tokens(text):
    """Split ``text`` into whitespace-separated tokens.

    A plain string is split directly. Joining a string with spaces first (as
    the previous code did) breaks it into single characters, which turned the
    score into a character-overlap measure. Token sequences are still joined
    and re-split, so their behaviour is unchanged.
    """
    if isinstance(text, str):
      return text.split()
    return " ".join(text).split()

  def compute(self,predictions, references, type = 'marco'):
    """Compute the overlap F1 score.

    Args:
      predictions: One prediction per example (a string or a token sequence).
      references: One list of references per example; only the first
        reference (``references[i][0]``) is scored.
      type: ``'micro'`` averages the per-example F1 values; ``'marco'``
        (historical spelling, ``'macro'`` is accepted as an alias) averages
        precision and recall first and combines them afterwards.

    Returns:
      ``{'f1': score}``; the score is ``0.0`` for empty input.

    Raises:
      ValueError: if ``type`` is unknown or the inputs differ in length.
    """
    if type not in ('micro', 'marco', 'macro'):
      raise ValueError(f"unknown averaging type: {type!r}")
    if len(predictions) != len(references):
      raise ValueError("predictions and references must have the same length")
    if len(predictions) == 0:
      # Avoid dividing by zero when there is nothing to score.
      return {'f1': 0.0}

    f1s =[]
    precisions = []
    recalls = []
    for prediction, reference in zip(predictions, references):
      # Tokenize once per example instead of on every membership test.
      pred_tokens = self._tokens(prediction)
      ref_tokens = self._tokens(reference[0])
      pred_vocab = set(pred_tokens)
      ref_vocab = set(ref_tokens)
      matched_pred = sum(1 for token in pred_tokens if token in ref_vocab)
      matched_ref = sum(1 for token in ref_tokens if token in pred_vocab)
      # The +1 in each denominator is add-one smoothing (also guards empty texts).
      p = matched_pred/(len(pred_tokens)+1)
      r = matched_ref/(len(ref_tokens)+1)
      # Tiny epsilon keeps the F1 denominator non-zero when p + r == 0.
      e = (1e-5)/(len(pred_tokens)+len(ref_tokens)+2)
      precisions.append(p)
      recalls.append(r)
      f1s.append(2*p*r*(p+r)/((p+r)**2 +e**2))

    if type == 'micro':
      return {'f1': sum(f1s)/len(f1s)}

    e_a = (1e-5)/(len(precisions)+len(recalls))
    p_a = sum(precisions)/len(precisions)
    r_a = sum(recalls)/len(recalls)
    return {'f1': 2*p_a*r_a*(p_a+r_a)/((p_a+r_a)**2 +e_a**2)}



def compute_metrics(eval_pred):
    """Decode predictions and labels and score them with all configured metrics.

    Args:
        eval_pred: ``(predictions, labels)`` pair handed over by the trainer.
            When ``predictions`` is a tuple, its first element is reduced with
            ``argmax`` over the last axis to obtain token ids.

    Returns:
        Dict with the ROUGE scores plus ``bleu``, ``meteor``, ``perplexity``,
        ``f1`` and ``gen_len`` (mean number of non-pad predicted tokens), each
        rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0].argmax(axis = -1)

    # Generated sequences gathered across batches can be padded with -100,
    # which the tokenizer cannot decode; map it to the pad token like the labels.
    # This also stops -100 padding from being counted in gen_len below.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # One sentence per line; each label is wrapped in a list (single reference).
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = [["\n".join(nltk.sent_tokenize(label.strip()))] for label in decoded_labels]

    # Metric
    #rouge
    result2 = metric2.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    #bleu
    result1 = metric1.compute(predictions=decoded_preds, references=decoded_labels)
    result2["bleu"] = result1["bleu"]
    #meteor
    result3 = metric3.compute(predictions=decoded_preds, references=decoded_labels)
    result2["meteor"] = result3["meteor"]
    #perplexity
    result4 = metric4.compute(predictions=decoded_preds, model_id='gpt2')
    result2["perplexity"] = result4['mean_perplexity']
    #f1
    result5 = metric5.compute(predictions=decoded_preds, references=decoded_labels)
    result2['f1'] = result5['f1']

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result2["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result2.items()}



if __name__  == "__main__":
    # Load the three JSON Lines splits named in the config.
    data_files = {"train": config["train"], "validation":config["dev"],"test":config["test"]}
    dataset = load_dataset("json", data_files=data_files)

    # These module-level names are read by compute_metrics().
    metric1 = evaluate.load("bleu")
    metric2 = evaluate.load("rouge")
    metric3 = evaluate.load("meteor")
    metric4 = evaluate.load("perplexity", module_type="metric")
    metric5 = f1()

    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    training_args = Seq2SeqTrainingArguments(
        output_dir=config["save_path"],
        report_to="tensorboard",
        load_best_model_at_end=True,
        save_strategy="epoch",
        evaluation_strategy="epoch",
        per_device_train_batch_size=config["batch_size"],
        per_device_eval_batch_size=config["batch_size_eval"],
        dataloader_num_workers=2,
        fp16=True,
        save_total_limit=1,
        logging_strategy="epoch",
        predict_with_generate=True,
        num_train_epochs=config["epoch"],
        learning_rate=config["lr"],
        weight_decay=config["weight_decay"],
        local_rank=config["gpu"],
        torch_compile=True,
        optim=config["optim"],
        gradient_accumulation_steps=config["gradient_accumulation_steps"],
    )
    trainer =  Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    trainer.train()

    # os.path.join avoids the doubled separator that string concatenation
    # produced when save_path already ends with "/".
    trainer.save_model(os.path.join(config["save_path"], "model-v1.0.0"))
    with open(os.path.join(config["save_path"], "test_results.txt"),"w") as f:
        f.write(str(trainer.evaluate(tokenized_datasets["test"])))

Overwriting /kaggle/working/train.py


In [33]:
# %%file /kaggle/working/transform.py
# import json
# import argparse

# parser = argparse.ArgumentParser(description='Transformation my dataset to group dataset type')
# parser.add_argument('--filepath', type=str,default="train.json",
#                     help='transformation file path ')
# parser.add_argument('--save_file', type=str,default="train.json",
#                     help='transformation saving file path ')
# parser.add_argument('--type', type=str,default="conv",
#                     help='transformation saving file path ')

# args = parser.parse_args()

# if __name__ == "__main__":
#   filepath = args.filepath
#   with open(filepath,"rb") as f:
#     data =json.load(f)
#   if args.type == "conv":
#     transformed_train = []
#     for da in data:
#         data_point = {}
#         for i, context in enumerate(da['context']):
#             data_point["context" +str(i)] =context
#         data_point['response'] = da['response']
#         transformed_train.append(data_point)
#   elif args.type == "dia":
#     transformed_train = []
#     for da in data:
#       try:
#         transformed_train.append({"dialog": list(da.values())[:-1] , "knowledge": "", "response": da["response"]})
#       except Exception:
#         continue
# with open(args.save_file, "w", encoding="utf-8") as f:
#     print(len(transformed_train))
#     for obj in transformed_train:
#         json.dump(obj, f, ensure_ascii=False)
#         f.write('\n')

Overwriting /kaggle/working/transform.py


In [116]:
%%file /kaggle/working/transform.py
import json
import argparse

def _build_parser():
    """Create the command-line parser for the JSON -> JSON Lines conversion."""
    parser = argparse.ArgumentParser(description='Transformation my dataset to group dataset type')
    parser.add_argument('--filepath', type=str,default="train.json",
                        help='transformation file path ')
    parser.add_argument('--save_file', type=str,default="train.json",
                        help='transformation saving file path ')
    # Still accepted so existing command lines keep working; no code path reads it.
    parser.add_argument('--type', type=str,default="conv",
                        help='transformation type (accepted for compatibility, currently unused)')
    return parser


def _load_records(filepath):
    """Load records from ``filepath``, accepting a JSON document or JSON Lines.

    The notebook converts files in place, so a second run sees the JSON Lines
    output of the first one. Falling back to line-by-line parsing keeps the
    conversion idempotent instead of failing with "Extra data".
    """
    with open(filepath, "rb") as f:
        raw = f.read()
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return [json.loads(line) for line in raw.splitlines() if line.strip()]


def _write_jsonl(records, save_file):
    """Write each record as one JSON object per line, keeping non-ASCII text."""
    with open(save_file, "w", encoding="utf-8") as f:
        for obj in records:
            json.dump(obj, f, ensure_ascii=False)
            f.write('\n')


def main(argv=None):
    """Convert the input file to JSON Lines and print the number of records.

    Args:
        argv: Optional argument list; defaults to ``sys.argv[1:]``.
    """
    args = _build_parser().parse_args(argv)
    # The input is read completely before the output is opened, so
    # --filepath and --save_file may name the same file.
    data = _load_records(args.filepath)
    print(len(data))
    _write_jsonl(data, args.save_file)


# All work now sits behind the guard; the write step used to run at import
# time while `data` only existed inside the guarded block.
if __name__ == "__main__":
    main()

Overwriting /kaggle/working/transform.py


In [117]:
!python /kaggle/working/transform.py --filepath /kaggle/working/train.json --save_file /kaggle/working/train.json

100000


In [118]:
!python /kaggle/working/transform.py --filepath /kaggle/working/test.json --save_file /kaggle/working/test.json

33333


In [119]:
!python /kaggle/working/transform.py --filepath /kaggle/working/validation.json --save_file /kaggle/working/validation.json

33333


In [None]:
!pip install -U nltk

In [None]:
!python /kaggle/working/train.py --config /kaggle/working/config.json

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-bcaa9012921f9394/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...
Downloading data files: 100%|███████████████████| 3/3 [00:00<00:00, 5329.48it/s]
Extracting data files: 100%|████████████████████| 3/3 [00:00<00:00, 1022.92it/s]
Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-bcaa9012921f9394/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e