In [None]:
%pip install transformers[torch] tokenizers datasets evaluate rouge_score sentencepiece huggingface_hub nltk pandas numpy adapters scikit-learn 

In [1]:
import json
from argparse import ArgumentParser
from tqdm import tqdm
import os
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers.pipelines.pt_utils import KeyDataset
from datasets import load_dataset, Dataset, DatasetDict
from datasets import concatenate_datasets
import evaluate
import nltk
import numpy as np
import pandas as pd
import random
from nltk.tokenize import sent_tokenize
from transformers import DataCollatorForSeq2Seq, pipeline
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from typing import DefaultDict
import time
import torch
from adapters import init, Seq2SeqAdapterTrainer
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import re
from sklearn.model_selection import train_test_split

2024-01-04 15:20:27.730587: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-04 15:20:27.730710: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-04 15:20:27.745123: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-04 15:20:27.770572: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/skystream/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
model_name = 'google/flan-t5-base'
tokenizer_name = 'google/flan-t5-base'

# MultiWOZ Intent Dataprep

In [4]:
dataset_name = "multi_woz_v22"

In [5]:
dataset = load_dataset(dataset_name)

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['dialogue_id', 'services', 'turns'],
        num_rows: 8437
    })
    validation: Dataset({
        features: ['dialogue_id', 'services', 'turns'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['dialogue_id', 'services', 'turns'],
        num_rows: 1000
    })
})

In [7]:
target_intents = []
for element in dataset:
    possible_intents = []
    input_texts = []
    for entry in dataset[element]:
        for turn in range(len(entry['turns']['turn_id'])):
            input_text = ''
            if entry['turns']['speaker'][turn] == 0: 
                #print("The user says: " + entry['turns']['utterance'][turn])
                input_text += f"The user says: {entry['turns']['utterance'][turn]}\n"
                # TBD: What intents could there be? 
                num_of_states = len(entry['turns']['frames'][turn]['state'])
                if num_of_states == 0:
                    intent = 'none'
                else:
                    intent = entry['turns']['frames'][turn]['state'][0]['active_intent']
                if 'find' in intent: 
                    m = re.search('find\_(.*)', intent)
                    target_intent = m.group(1)
                    #print("Question: Did the user intend to talk about finding some " + str(target_intent) + "?")
                    input_text += f"Question: Did the user intend to talk about finding some {target_intent} ?\n"
                    target_intents.append(target_intent)
                if 'book' in intent: 
                    m = re.search('book\_(.*)', intent)
                    target_intent = m.group(1)
                    #print("Question: Did the user intend to talk about booking some " + str(target_intent) + "?")
                    input_text += f"Question: Did the user intend to talk about booking some {target_intent} ?\n"
                    target_intents.append(target_intent)
                    
                possible_intents.append(intent)
                input_texts.append(input_text)
    if element == 'train':
        train_df = pd.DataFrame(input_texts, columns=['input'])
    elif element == 'validation':
        val_df = pd.DataFrame(input_texts, columns=['input'])
    else:
        test_df = pd.DataFrame(input_texts, columns=['input'])

In [8]:
#add labels
train_df['labels'] = 'yes'
val_df['labels'] = 'yes'
test_df['labels'] = 'yes'

In [9]:
#add true negatives
train_df_tp, train_df_tn = train_test_split(train_df, test_size=0.3, random_state=42)
val_df_tp, val_df_tn = train_test_split(val_df, test_size=0.3, random_state=42)
test_df_tp, test_df_tn = train_test_split(test_df, test_size=0.3, random_state=42)

In [10]:
len(train_df_tp), len(train_df_tn), len(val_df_tp), len(val_df_tn), len(test_df_tp), len(test_df_tn)

(39743, 17033, 5161, 2213, 5160, 2212)

In [11]:
unique_intents = set(target_intents)

In [12]:
# tn for train
for index, entry in train_df_tn.iterrows():
    new_input_text= ''
    sentence_intent = entry['input'].split()[-2]
    new_input_text += entry['input'].split(sep='\n')[0] + '\n'
    #print(sentence_intent)
    if re.match('find\_(.*)', entry['input']):
        #print(f"Question: Did the user intend to talk about booking some {random.choice(list(unique_intents))} ?\n")
        new_input_text += f"Question: Did the user intend to talk about booking some {random.choice(list(unique_intents))} ?\n"
    else:
        #print(f"Question: Did the user intend to talk about finding some {random.choice(list(unique_intents))} ?\n")
        new_input_text += f"Question: Did the user intend to talk about finding some {random.choice(list(unique_intents))} ?\n"
    train_df_tn.at[index, 'input'] = new_input_text

# tn for val
for index, entry in val_df_tn.iterrows():
    new_input_text= ''
    sentence_intent = entry['input'].split()[-2]
    new_input_text += entry['input'].split(sep='\n')[0] + '\n'
    #print(sentence_intent)
    if re.match('find\_(.*)', entry['input']):
        #print(f"Question: Did the user intend to talk about booking some {random.choice(list(unique_intents))} ?\n")
        new_input_text += f"Question: Did the user intend to talk about booking some {random.choice(list(unique_intents))} ?\n"
    else:
        #print(f"Question: Did the user intend to talk about finding some {random.choice(list(unique_intents))} ?\n")
        new_input_text += f"Question: Did the user intend to talk about finding some {random.choice(list(unique_intents))} ?\n"
    val_df_tn.at[index, 'input'] = new_input_text

# tn for test
for index, entry in test_df_tn.iterrows():
    new_input_text= ''
    sentence_intent = entry['input'].split()[-2]
    new_input_text += entry['input'].split(sep='\n')[0] + '\n'
    #print(sentence_intent)
    if re.match('find\_(.*)', entry['input']):
        #print(f"Question: Did the user intend to talk about booking some {random.choice(list(unique_intents))} ?\n")
        new_input_text += f"Question: Did the user intend to talk about booking some {random.choice(list(unique_intents))} ?\n"
    else:
        #print(f"Question: Did the user intend to talk about finding some {random.choice(list(unique_intents))} ?\n")
        new_input_text += f"Question: Did the user intend to talk about finding some {random.choice(list(unique_intents))} ?\n"
    test_df_tn.at[index, 'input'] = new_input_text

In [13]:
train_df_tn['labels'] = 'no'
val_df_tn['labels'] = 'no'
test_df_tn['labels'] = 'no'

In [14]:
train_df_tn.head()

Unnamed: 0,input,labels
30033,"The user says: Hello, I am looking for a place...",no
9712,"The user says: that sounds good, i will need a...",no
49004,The user says: I am planning a trip in Cambrid...,no
27051,The user says: Thank you for all your help. I ...,no
10241,The user says: What was their price range agai...,no


In [15]:
final_train_df = pd.concat([train_df_tp, train_df_tn])
final_val_df = pd.concat([val_df_tp, val_df_tn])
final_test_df = pd.concat([test_df_tp, test_df_tn])

In [16]:
final_train_df.sample(10)

Unnamed: 0,input,labels
52433,The user says: Just one ticket please and than...,yes
18212,"The user says: Yes, it will be fine. I need ti...",yes
42433,"The user says: Just myself, at 14:30 on Thursd...",no
28807,"The user says: Yes, I am looking for a train l...",yes
35401,The user says: Could I get the phone number an...,yes
41378,"The user says: No, thanks. You answered all my...",no
1867,"The user says: Yes, I'd like to book for Thurs...",yes
36582,"The user says: Opps no, that was an error on m...",no
28782,The user says: It doesn't matter the part of t...,no
44640,The user says: The train should leave after 08...,no


In [17]:
processed_intent_dataset = DatasetDict({
    "train": Dataset.from_pandas(final_train_df),
    "validation": Dataset.from_pandas(final_val_df),
    "test": Dataset.from_pandas(final_test_df)
    })

In [18]:
processed_intent_dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'labels', '__index_level_0__'],
        num_rows: 56776
    })
    validation: Dataset({
        features: ['input', 'labels', '__index_level_0__'],
        num_rows: 7374
    })
    test: Dataset({
        features: ['input', 'labels', '__index_level_0__'],
        num_rows: 7372
    })
})

# Multi2WOZ Intent Dataprep

In [19]:
with open('multi2woz_dataset/test_full_de.json', 'r') as f:
  test_data = json.load(f)

with open('multi2woz_dataset/val_full_de.json', 'r') as f:
  val_data = json.load(f)

In [20]:
list_of_keys_test = []
list_of_values_test = []

for key in test_data:
        id = key
        list_of_keys_test.append(id)
        value = test_data[key]
        #print("The key and value are ({}) = ({})".format(key, value))
        for key in value:
            #print("The key and value are ({}) = ({})".format(key, value[key]))
            inhalt = value[key]
            list_of_values_test.append(inhalt)

list_of_keys_val = []
list_of_values_val = []

for key in val_data:
        id = key
        list_of_keys_val.append(id)
        value = val_data[key]
        #print("The key and value are ({}) = ({})".format(key, value))
        for key in value:
            #print("The key and value are ({}) = ({})".format(key, value[key]))
            inhalt = value[key]
            list_of_values_val.append(inhalt)

In [21]:
clean_list_of_values_test = []
for element in list_of_values_test:
    only_text = []
    for text in element:
        #print(text['text'])
        only_text.append(text['text'])
    clean_list_of_values_test.append(only_text)

clean_list_of_values_val = []
for element in list_of_values_val:
    only_text = []
    for text in element:
        #print(text['text'])
        only_text.append(text['text'])
    clean_list_of_values_val.append(only_text)

In [22]:
multi2_df_test = pd.DataFrame({'id':list_of_keys_test,'text':clean_list_of_values_test})
multi2_df_val = pd.DataFrame({'id':list_of_keys_val,'text':clean_list_of_values_val})

In [23]:
multiwoz_test = load_dataset(dataset_name, split='test')
multiwoz_val = load_dataset(dataset_name, split='validation')

In [24]:
list_of_d_ids_test = []
list_of_services_test = []
list_of_turns_test = []
for entry in multiwoz_test:
    d_id = entry['dialogue_id']
    list_of_d_ids_test.append(d_id)
    service = entry['services']
    list_of_services_test.append(service)
    turns = entry['turns']
    list_of_turns_test.append(turns)

list_of_d_ids_val = []
list_of_services_val = []
list_of_turns_val = []
for entry in multiwoz_val:
    d_id = entry['dialogue_id']
    list_of_d_ids_val.append(d_id)
    service = entry['services']
    list_of_services_val.append(service)
    turns = entry['turns']
    list_of_turns_val.append(turns)

In [25]:
multiwoz_df_test = pd.DataFrame({'id':list_of_d_ids_test,'services':list_of_services_test,'turns':list_of_turns_test})
multiwoz_df_val = pd.DataFrame({'id':list_of_d_ids_val,'services':list_of_services_val,'turns':list_of_turns_val})

In [26]:
full_df_test = pd.merge(multiwoz_df_test, multi2_df_test,on='id')
full_df_val = pd.merge(multiwoz_df_val, multi2_df_val,on='id')

In [27]:
full_df_test.head()

Unnamed: 0,id,services,turns,text
0,MUL0484.json,"[attraction, train]","{'turn_id': ['0', '1', '2', '3', '4', '5', '6'...",[Ich brauche Zugreservierungen von Norwich nac...
1,PMUL4462.json,"[restaurant, taxi, attraction]","{'turn_id': ['0', '1', '2', '3', '4', '5', '6'...","[Hallo, ich suche ein Restaurant in Cambridge...."
2,PMUL0320.json,"[restaurant, taxi, hotel]","{'turn_id': ['0', '1', '2', '3', '4', '5', '6'...","[Hallo, ich bin auf der Suche nach einem Hotel..."
3,MUL2155.json,"[train, hotel]","{'turn_id': ['0', '1', '2', '3', '4', '5', '6'...",[Ich suche eine Unterkunft im Norden der Stadt...
4,PMUL0815.json,"[restaurant, train]","{'turn_id': ['0', '1', '2', '3', '4', '5', '6'...","[Ich brauche einen Ort zum Essen, und ich würd..."


In [28]:
full_df_val.head()

Unnamed: 0,id,services,turns,text
0,PMUL0698.json,"[restaurant, train]","{'turn_id': ['0', '1', '2', '3', '4', '5', '6'...",[Ich suche einen lokalen Ort zum Essen im Zent...
1,PMUL3233.json,"[taxi, attraction, hotel]","{'turn_id': ['0', '1', '2', '3', '4', '5', '6'...",[Mein Mann und ich feiern unser Jubiläum und m...
2,SNG01627.json,[taxi],"{'turn_id': ['0', '1', '2', '3', '4', '5'], 's...","[Ich brauche ein Taxi, um bis 19:30 Uhr zum Ba..."
3,MUL1719.json,"[attraction, train]","{'turn_id': ['0', '1', '2', '3', '4', '5', '6'...","[Ich suche einen Ort im Westen der Stadt., Hab..."
4,MUL0242.json,"[restaurant, train]","{'turn_id': ['0', '1', '2', '3', '4', '5', '6'...","[Ich suche ein teures Restaurant im Zentrum., ..."


In [29]:
m2w_preprocessed_dataset = DatasetDict({
    "validation": Dataset.from_pandas(full_df_val),
    "test": Dataset.from_pandas(full_df_test)
    })

In [30]:
target_intents_m2w = []
for element in m2w_preprocessed_dataset:
    possible_intents = []
    input_texts = []
    for entry in m2w_preprocessed_dataset[element]:
        for turn in range(len(entry['turns']['turn_id'])):
            input_text = ''
            if entry['turns']['speaker'][turn] == 0: 
                #print("Der Anwender sagt: " + entry['turns']['utterance'][turn])
                input_text += f"Der Anwender sagt: {entry['text'][turn]}\n"
                # TBD: What intents could there be? 
                num_of_states = len(entry['turns']['frames'][turn]['state'])
                if num_of_states == 0:
                    intent = 'none'
                else:
                    intent = entry['turns']['frames'][turn]['state'][0]['active_intent']
                if 'find' in intent: 
                    m = re.search('find\_(.*)', intent)
                    target_intent = m.group(1)
                    #print("Question: Did the user intend to talk about finding some " + str(target_intent) + "?")
                    input_text += f"Frage: Beabsichtigt der Anwender ein {target_intent} zu finden ?\n"
                    target_intents_m2w.append(target_intent)
                if 'book' in intent: 
                    m = re.search('book\_(.*)', intent)
                    target_intent = m.group(1)
                    #print("Question: Did the user intend to talk about booking some " + str(target_intent) + "?")
                    input_text += f"Frage: Beabsichtigt der Anwender ein {target_intent} zu buchen ?\n"
                    target_intents_m2w.append(target_intent)
                    
                possible_intents.append(intent)
                input_texts.append(input_text)
    if element == 'validation':
        m2w_val_df = pd.DataFrame(input_texts, columns=['input'])
    else:
        m2w_test_df = pd.DataFrame(input_texts, columns=['input'])

In [31]:
m2w_val_df['labels'] = 'ja'
m2w_test_df['labels'] = 'ja'

In [32]:
len(m2w_val_df)

7374

In [33]:
#add true negatives
m2w_val_df_tp, m2w_val_df_tn = train_test_split(m2w_val_df, test_size=0.3, random_state=42)
m2w_test_df_tp, m2w_test_df_tn = train_test_split(m2w_test_df, test_size=0.3, random_state=42)

In [34]:
german_intents = ['Hotel', 'Restaurant', 'Attraktion', 'Zug', 'Taxi', 'Krankenhaus', 'Polizei']

In [35]:
# tn for val
for index, entry in m2w_val_df_tn.iterrows():
    new_input_text= ''
    sentence_intent = entry['input'].split()[-4]
    new_input_text += entry['input'].split(sep='\n')[0] + '\n'
    #print(sentence_intent)
    if re.match('finden\_(.*)', entry['input']):
        #print(f"Question: Did the user intend to talk about booking some {random.choice(list(unique_intents))} ?\n")
        new_input_text += f"Frage: Beabsichtigt der Anwender ein  {random.choice(german_intents)} zu buchen ?\n"
    else:
        #print(f"Question: Did the user intend to talk about finding some {random.choice(list(unique_intents))} ?\n")
        new_input_text += f"Frage: Beabsichtigt der Anwender ein  {random.choice(german_intents)} zu finden ?\n"
    m2w_val_df_tn.at[index, 'input'] = new_input_text

# tn for test
for index, entry in m2w_test_df_tn.iterrows():
    new_input_text= ''
    sentence_intent = entry['input'].split()[-4]
    new_input_text += entry['input'].split(sep='\n')[0] + '\n'
    #print(sentence_intent)
    if re.match('finden\_(.*)', entry['input']):
        #print(f"Question: Did the user intend to talk about booking some {random.choice(list(unique_intents))} ?\n")
        new_input_text += f"Question: Did the user intend to talk about booking some {random.choice(german_intents)} ?\n"
    else:
        #print(f"Question: Did the user intend to talk about finding some {random.choice(list(unique_intents))} ?\n")
        new_input_text += f"Question: Did the user intend to talk about finding some {random.choice(german_intents)} ?\n"
    m2w_test_df_tn.at[index, 'input'] = new_input_text

In [36]:
m2w_val_df_tn['labels'] = 'nein'
m2w_test_df_tn['labels'] = 'nein'

In [37]:
m2w_final_val_df = pd.concat([m2w_val_df_tp, m2w_val_df_tn])
m2w_final_test_df = pd.concat([m2w_test_df_tp, m2w_test_df_tn])

In [38]:
m2w_processed_dataset = DatasetDict({
    "train": Dataset.from_pandas(m2w_final_val_df),
    "test": Dataset.from_pandas(m2w_final_test_df)
    })

In [39]:
m2w_processed_dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'labels', '__index_level_0__'],
        num_rows: 7374
    })
    test: Dataset({
        features: ['input', 'labels', '__index_level_0__'],
        num_rows: 7372
    })
})

# Combine MultiWOZ and Multi2WOZ Intent Dataset

In [40]:
#final_train_df
#final_val_df
#final_test_df

#m2w_final_val_df
#m2w_final_test_df


intent_combined_train_df = pd.concat([final_train_df[:12000], m2w_final_val_df])
intent_combined_test_df = pd.concat([final_test_df, m2w_final_test_df])


#intent_combined_train_df, intent_combined_test_df = train_test_split(pd.concat([final_train_df[:len(m2w_final_val_df)], m2w_final_val_df,final_test_df, m2w_final_test_df]), test_size=0.3, random_state=42)

In [41]:
combined_intent_dataset = DatasetDict({
    "train": Dataset.from_pandas(intent_combined_train_df),
    "test": Dataset.from_pandas(intent_combined_test_df)
    })

In [42]:
combined_intent_dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'labels', '__index_level_0__'],
        num_rows: 19374
    })
    test: Dataset({
        features: ['input', 'labels', '__index_level_0__'],
        num_rows: 14744
    })
})

# MultiWOZ Slot Dataset Prep

In [43]:
possible_slots = []
for element in dataset:
    input_texts = []
    slots = []
    for entry in dataset[element]:
        for turn in range(len(entry['turns']['turn_id'])):
            if entry['turns']['speaker'][turn] == 0: 
                #print("The user says: " + entry['turns']['utterance'][turn])
                input_text = f"The user says: {entry['turns']['utterance'][turn]}\n"
                # TBD: What slots could there be? 
                num_of_states = len(entry['turns']['frames'][turn]['state'])
                if num_of_states == 0:
                    intent = 'none'
                else:
                    for state in entry['turns']['frames'][turn]['state']:
                        for i in range(len(state['slots_values']['slots_values_name'])):
                            slot_name = state['slots_values']['slots_values_name'][i]
                            slot_value = state['slots_values']['slots_values_list'][i]
                            m = re.search('(.*)-(.*)', slot_name)
                            #print("What is the " + m.group(2) + " of the " + m.group(1) + " mentioned in the sentence?")
                            input_text += f"What is the {m.group(2)} of the {m.group(1)} mentioned in the sentence?\n"
                            possible_slots.append(slot_name)
                            #print(state['slots_values']['slots_values_name'][i])
                            #print(state['slots_values']['slots_values_list'][i])
                            input_texts.append(input_text)
                            slots.append(slot_value[0])
    if element == 'train':
        slot_train_df = pd.DataFrame({'input':input_texts, 'labels':slots})
    elif element == 'validation':
        slot_val_df = pd.DataFrame({'input':input_texts, 'labels':slots})
    else:
        slot_test_df = pd.DataFrame({'input':input_texts, 'labels':slots})

In [44]:
processed_slot_dataset = DatasetDict({
    "train": Dataset.from_pandas(slot_train_df),
    "validation": Dataset.from_pandas(slot_val_df),
    "test": Dataset.from_pandas(slot_test_df)
    })

In [45]:
processed_slot_dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'labels'],
        num_rows: 163411
    })
    validation: Dataset({
        features: ['input', 'labels'],
        num_rows: 21276
    })
    test: Dataset({
        features: ['input', 'labels'],
        num_rows: 22007
    })
})

# Multi2WOZ Slot Dataset Prep

In [56]:
possible_slots = []
for element in m2w_preprocessed_dataset:
    input_texts = []
    slots = []
    for entry in m2w_preprocessed_dataset[element]:
        for turn in range(len(entry['turns']['turn_id'])):
            if entry['turns']['speaker'][turn] == 0: 
                #print("The user says: " + entry['turns']['utterance'][turn])
                input_text = f"Der Anwender sagt: {entry['text'][turn]}\n"
                # TBD: What slots could there be? 
                num_of_states = len(entry['turns']['frames'][turn]['state'])
                if num_of_states == 0:
                    intent = 'none'
                else:
                    for state in entry['turns']['frames'][turn]['state']:
                        for i in range(len(state['slots_values']['slots_values_name'])):
                            slot_name = state['slots_values']['slots_values_name'][i]
                            slot_value = state['slots_values']['slots_values_list'][i]
                            m = re.search('(.*)-(.*)', slot_name)
                            #print("What is the " + m.group(2) + " of the " + m.group(1) + " mentioned in the sentence?")
                            input_text += f"Was ist der/die {m.group(2)} von {m.group(1)} welche in der Aussage erwähnt wird?\n"
                            possible_slots.append(slot_name)
                            #print(state['slots_values']['slots_values_name'][i])
                            #print(state['slots_values']['slots_values_list'][i])
                            #print(input_text)
                            input_texts.append(input_text)
                            slots.append(slot_value[0])
    if element == 'validation':
        m2w_slot_val_df = pd.DataFrame({'input':input_texts, 'labels':slots})
    else:
        m2w_slot_test_df = pd.DataFrame({'input':input_texts, 'labels':slots})

In [57]:
m2w_processed_slot_dataset = DatasetDict({
    "train": Dataset.from_pandas(m2w_slot_val_df[:14000]),
    "test": Dataset.from_pandas(m2w_slot_test_df[:8000])
    })

In [58]:
m2w_processed_slot_dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'labels'],
        num_rows: 14000
    })
    test: Dataset({
        features: ['input', 'labels'],
        num_rows: 8000
    })
})

# Combine MultiWOZ and Multi2WOZ Slot Dataset

In [49]:
slot_combined_train_df = pd.concat([slot_val_df, m2w_slot_val_df])
slot_combined_test_df = pd.concat([slot_test_df[:10000], m2w_slot_test_df[:10000]])

In [50]:
combined_slot_dataset = DatasetDict({
    "train": Dataset.from_pandas(slot_combined_train_df),
    "test": Dataset.from_pandas(slot_combined_test_df)
    })

In [51]:
combined_slot_dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'labels', '__index_level_0__'],
        num_rows: 42552
    })
    test: Dataset({
        features: ['input', 'labels', '__index_level_0__'],
        num_rows: 20000
    })
})

# Tokenizer and Model Prep

In [59]:
#Set Dataset here to train on different datasets

dataset = m2w_processed_slot_dataset

In [60]:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
tokenizer.pad_token = tokenizer.eos_token

In [61]:
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["input"], truncation=True), batched=True, remove_columns=["input", "labels"])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

Map:   0%|          | 0/22000 [00:00<?, ? examples/s]

Max source length: 299


In [62]:
# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["labels"], truncation=True), batched=True, remove_columns=["input", "labels"])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/22000 [00:00<?, ? examples/s]

Max target length: 15


In [63]:
def preprocess_function(sample,padding="max_length"):
    # tokenize inputs
    model_inputs = tokenizer(sample["input"], max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text=sample["labels"], max_length=max_target_length, padding=padding, truncation=True)
    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [64]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["input", "labels"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

Map:   0%|          | 0/14000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['labels', 'input_ids', 'attention_mask']


In [65]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [66]:
metric = evaluate.load("rouge")

In [67]:
# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result['f1_score'] = f1_score(decoded_labels, decoded_preds, average="macro")
    result['accuracy_score'] = accuracy_score(decoded_labels, decoded_preds)
    result['precision_score'] = precision_score(decoded_labels, decoded_preds, average="macro")
    result['recall_score'] = recall_score(decoded_labels, decoded_preds, average="macro")
    return result

In [68]:
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [62]:
init(model)

In [63]:
model.add_adapter('multi2woz',config='lora')

In [64]:
model.train_adapter('multi2woz')

In [65]:
# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir='results-flan-t5-mwoz',
    do_eval=True,
    evaluation_strategy="epoch",
    #do_train=False,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=0.001,
    num_train_epochs=4,
    # logging & evaluation strategies
    logging_dir=f"results-flan-t5-mwoz/logs",
    logging_strategy="steps",
    logging_steps=1000,
    save_strategy="epoch",
    save_total_limit=3,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
    push_to_hub=False,
#    gradient_checkpointing=True,
#    gradient_accumulation_steps=4,
#    optim='adafactor',

)

# Create Trainer instance
trainer = Seq2SeqAdapterTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

In [66]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,F1 Score,Accuracy Score,Precision Score,Recall Score
1,2.0511,1.200211,17.7956,3.5907,17.7656,17.7501,19.6845,0.02125,0.02125,0.02125,0.02125
2,1.187,1.056735,15.4137,2.7379,15.4011,15.3994,19.804375,0.02125,0.02125,0.02125,0.02125
3,1.017,0.969758,21.2727,3.9119,21.2528,21.271,19.61075,0.064125,0.064125,0.064125,0.064125
4,0.9424,0.945885,21.0147,4.0588,20.9935,21.0011,19.708125,0.0565,0.0565,0.0565,0.0565




TrainOutput(global_step=7000, training_loss=1.2357965262276787, metrics={'train_runtime': 6792.6433, 'train_samples_per_second': 8.244, 'train_steps_per_second': 1.031, 'total_flos': 2.285855244288e+16, 'train_loss': 1.2357965262276787, 'epoch': 4.0})

In [67]:
model.save_adapter("./flan-t5-adapter-slot-intent", "multi2woz")

In [None]:
trainer.evaluate()

# Evaluation

In [None]:
output = trainer.predict(tokenized_dataset["test"], max_new_tokens=max_target_length)
outputs = tokenizer.batch_decode(output.predictions, skip_special_tokens=True)

In [None]:
labels = [[idx for idx in label if idx!=-100] for label in tokenized_dataset["test"]["labels"]]
labels_decoded = tokenizer.batch_decode(labels, skip_special_tokens=True)

In [None]:
print(f'f1 Score: '+str(f1_score(labels_decoded, outputs, average="macro")))

In [None]:
print(f"Accuracy: "+str(accuracy_score(labels_decoded, outputs)))

In [None]:
print(f"Precision: "+str(precision_score(labels_decoded, outputs, average='macro')))

In [None]:
print(f"Recall: "+str(recall_score(labels_decoded, outputs, average='macro')))

# Execute

In [None]:
from transformers import AutoTokenizer
from adapters import AutoAdapterModel

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
model = AutoAdapterModel.from_pretrained(model_name)

In [None]:
mwoz_adapter = model.load_adapter("./mult2iwoz-flan-t5-adapter", "multi2woz")

In [None]:
model.set_active_adapters(mwoz_adapter)

In [None]:
from transformers import pipeline

pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

In [None]:
result = pipe("The user says: I want to find a hotel.", max_length=100, num_beams=5, early_stopping=True)

# Test Base Model

In [69]:

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir='results-flan-t5-mwoz-base',
    do_eval=True,
    #do_train=False,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=0.001,
    num_train_epochs=4,
    # logging & evaluation strategies
    logging_dir=f"results-flan-t5-mwoz-base/logs",
    logging_strategy="steps",
    logging_steps=1000,
    save_strategy="epoch",
    save_total_limit=3,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
    push_to_hub=False,
#    gradient_checkpointing=True,
#    gradient_accumulation_steps=4,
#    optim='adafactor',

)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

In [70]:
trainer.evaluate()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 6.688704013824463,
 'eval_rouge1': 2.4912,
 'eval_rouge2': 1.1382,
 'eval_rougeL': 2.477,
 'eval_rougeLsum': 2.48,
 'eval_gen_len': 19.327875,
 'eval_f1_score': 6.663113006396588e-05,
 'eval_accuracy_score': 0.000125,
 'eval_precision_score': 8.884150675195451e-05,
 'eval_recall_score': 5.330490405117271e-05,
 'eval_runtime': 745.41,
 'eval_samples_per_second': 10.732,
 'eval_steps_per_second': 1.342}