In [55]:
# DailyDialog Dataset

import os
import sys
import os.path
import shutil

os.chdir('/collection/ka2khan/thesis/Dialog_Generation/notebooks')
print(os.getcwd())

dataset_path = '/collection/ka2khan/thesis/Dialog_Generation/data/DailyDialog'
dataset_parts = ['train', 'valid', 'test']
orig_dir = 'original'
proc_dir = 'processed'

if os.path.isdir(os.path.join(dataset_path, proc_dir)):
    shutil.rmtree(os.path.join(dataset_path, proc_dir))


data_items = {}
for part in dataset_parts:
    dir_path = os.path.join(dataset_path, orig_dir)
    file_path = os.path.join(dir_path,  f'dialogues_{part}.txt')
    print(f'Reading data file: {file_path}')
    with open(file_path) as f_obj:
        lines = [line.strip() for line in f_obj.readlines() if line.strip() != '']
        data_items[part] = list(set(lines))

inver_train_items = {item: True for item in data_items['train']}

dedup_data_items = {'train': []}

for key, val in inver_train_items.items():
    dedup_data_items['train'].append(key)

for part in dataset_parts:
    if part == 'train':
        continue

    dedup_data_items[part] = []
    for item in data_items[part]:
        dialog = item
        if dialog not in inver_train_items:
            dedup_data_items[part].append(dialog)

for part in dataset_parts:
    print(f'Total dialogs in {part} before deduplication: {len(data_items[part])} after deduplication: {len(dedup_data_items[part])}')

utterances = {}
contexts = {}
queries = {}
responses = {}
for part in dataset_parts:

    utterances[part] = []
    contexts[part] = []
    queries[part] = []
    responses[part] = []
    for dialog in dedup_data_items[part]:
        dialog_utts = dialog.split('__eou__')

        dialog_utts = [utt.strip() for utt in dialog_utts if utt.strip() != '']

        utterances[part].extend(dialog_utts)
        #print(len(dialog_utts))
        for utt_index in range(1, len(dialog_utts)):
            #print(utt_index)
            #print(dialog_utts[:utt_index])
            #print(dialog_utts[utt_index])
            if utt_index == 1:
                contexts[part].append([])
            else:
                contexts[part].append(dialog_utts[:utt_index-1])

            queries[part].append(dialog_utts[utt_index-1])
            responses[part].append(dialog_utts[utt_index])

proc_dir_path = os.path.join(dataset_path, proc_dir)
os.makedirs(proc_dir_path)


utterance_dict = {}    
query_resp_dict = {}
for part in dataset_parts:            
    # Write Utterances File
    utterances_file_path = os.path.join(proc_dir_path, f'{part}_utterances.tsv')
    with open(utterances_file_path, 'w') as f_obj:
        for utterance in utterances[part]:
            if utterance in utterance_dict:
                continue
            else:
                utterance_dict[utterance] = True
                
            f_obj.write(f'{utterance}\n')

    # Write Dialog File
    dialog_file_path = os.path.join(proc_dir_path, f'{part}_dialog.tsv')
    with open(dialog_file_path, 'w') as f_obj:
        for index in range(len(contexts[part])):
            context = ' '.join(contexts[part][index])
            query = queries[part][index]
            response = responses[part][index]
            
            query_resp_pair = (query, response)
            
            if query_resp_pair in query_resp_dict:
                continue
            else:
                query_resp_dict[query_resp_pair] = True

            f_obj.write(f"{context}\t{query}\t{response}\n")
        
                    



/collection/ka2khan/thesis/Dialog_Generation/notebooks
Reading data file: /collection/ka2khan/thesis/Dialog_Generation/data/DailyDialog/raw/dialogues_train.txt
Reading data file: /collection/ka2khan/thesis/Dialog_Generation/data/DailyDialog/raw/dialogues_valid.txt
Reading data file: /collection/ka2khan/thesis/Dialog_Generation/data/DailyDialog/raw/dialogues_test.txt
Total dialogs in train before deduplication: 10549 after deduplication: 10549
Total dialogs in valid before deduplication: 995 after deduplication: 927
Total dialogs in test before deduplication: 996 after deduplication: 904


In [21]:
# 1-Billion Dataset

import os
import sys
import os.path
import shutil
import glob
from tqdm import tqdm
import numpy as np

os.chdir('/collection/ka2khan/thesis/Dialog_Generation/notebooks')
print(os.getcwd())

dataset_path = '/collection/ka2khan/thesis/Dialog_Generation/data/1-Billion'
dataset_parts = ['train', 'valid', 'test']
orig_dir = 'original'
proc_dir = 'processed'

if os.path.isdir(os.path.join(dataset_path, proc_dir)):
    shutil.rmtree(os.path.join(dataset_path, proc_dir))

os.makedirs(os.path.join(dataset_path, proc_dir))

    
# Read train set files
dir_path = os.path.join(dataset_path, orig_dir, 'train')
train_file_list = glob.glob(f'{dir_path}/news.en-*')
train_file_list.sort()

train_sentences = []
for file_path in tqdm(train_file_list):
    with open(file_path) as f_obj:
        train_sentences.extend(f_obj.readlines())
    
    

test_file_path = os.path.join(dataset_path, orig_dir, 'test/news.en-00000-of-00100')
with open(test_file_path) as f_obj:
    test_sentences = f_obj.readlines()

print(f'Total train sentences: {len(train_sentences)}')
print(f'Total test sentences: {len(test_sentences)}')

indices = np.arange(len(train_sentences))
np.random.shuffle(indices)

valid_indices = indices[:300000]
train_indices = indices[300000:5600000]

sentences = {'train': [], 'valid': [], 'test': []}
sentences['test'] = test_sentences

for index in train_indices:
    sentences['train'].append(train_sentences[index])

for index in valid_indices:
    sentences['valid'].append(train_sentences[index])

for part in sentences.keys():
    print(f'{part} - {len(sentences[part])} sentences')

proc_file_paths = {
    'train': '/collection/ka2khan/thesis/Dialog_Generation/data/1-Billion/processed/train_sentences.txt',
    'valid': '/collection/ka2khan/thesis/Dialog_Generation/data/1-Billion/processed/valid_sentences.txt',
    'test': '/collection/ka2khan/thesis/Dialog_Generation/data/1-Billion/processed/test_sentences.txt',
}

for part, file_path in proc_file_paths.items():
    with open(file_path, 'w') as f_obj:
        for sentence in sentences[part]:
            f_obj.write(sentence)

/collection/ka2khan/thesis/Dialog_Generation/notebooks


100%|██████████| 99/99 [00:08<00:00, 11.84it/s]


Total train sentences: 30301028
Total test sentences: 306688
train - 5300000 sentences
valid - 300000 sentences
test - 306688 sentences


In [9]:
# Switchboard Dataset
import pickle
import os

orig_dataset_path = '/collection/ka2khan/DialogWAE/data/SWDA/full_swda_clean_42da_sentiment_dialog_corpus.p'

dataset_path = '/collection/ka2khan/thesis/Cond_Text_Gen/data/SWDA'
dataset_parts = ['train', 'valid', 'test']
proc_dir = 'processed'

with open(orig_dataset_path, 'rb') as f_obj:
    orig_dataset = pickle.load(f_obj)

print(orig_dataset.keys())

parts = ['train', 'valid', 'test']
data_items = {}
for part in parts:
    data_items[part] = []
    for dialog in orig_dataset[part]:
        utterances = []
        last_speaker = None
        for utt in dialog['utts']:
            utterances.append(utt[1])
        data_items[part].append(utterances)
        

utterances = {}
contexts = {}
queries = {}
responses = {}
for part in dataset_parts:

    utterances[part] = []
    contexts[part] = []
    queries[part] = []
    responses[part] = []
    for dialog_utts in data_items[part]:
        dialog_utts = [utt.strip() for utt in dialog_utts if utt.strip() != '']

        utterances[part].extend(dialog_utts)
        #print(len(dialog_utts))
        for utt_index in range(1, len(dialog_utts)):
            #print(utt_index)
            #print(dialog_utts[:utt_index])
            #print(dialog_utts[utt_index])
            if utt_index == 1:
                contexts[part].append([])
            else:
                contexts[part].append(dialog_utts[:utt_index-1])

            queries[part].append(dialog_utts[utt_index-1])
            responses[part].append(dialog_utts[utt_index])

proc_dir_path = os.path.join(dataset_path, proc_dir)

if os.path.isdir(proc_dir_path):
    shutil.rmtree(proc_dir_path)            
            
os.makedirs(proc_dir_path)


utterance_dict = {}    
query_resp_dict = {}
for part in dataset_parts:            
    # Write Utterances File
    utterances_file_path = os.path.join(proc_dir_path, f'{part}_sentences.tsv')
    with open(utterances_file_path, 'w') as f_obj:
        for utterance in utterances[part]:
            if utterance in utterance_dict:
                continue
            else:
                utterance_dict[utterance] = True
                
            f_obj.write(f'{utterance}\n')

    # Write Dialog File
    dialog_file_path = os.path.join(proc_dir_path, f'{part}_tuples.tsv')
    with open(dialog_file_path, 'w') as f_obj:
        for index in range(len(contexts[part])):
            context = ' '.join(contexts[part][index])
            query = queries[part][index]
            response = responses[part][index]
            
            query_resp_pair = (query, response)
            
            if query_resp_pair in query_resp_dict:
                continue
            else:
                query_resp_dict[query_resp_pair] = True

            f_obj.write(f"{context}\t{query}\t{response}\n")

        


dict_keys(['test', 'train', 'valid'])


In [None]:
# DailyDialog Dataset

import os
import sys
import os.path
import shutil

os.chdir('/collection/ka2khan/thesis/Dialog_Generation/notebooks')
print(os.getcwd())

dataset_path = '/collection/ka2khan/thesis/Cond_Text_Gen/data/DailyDialog'
dataset_parts = ['train', 'valid', 'test']
orig_dir = 'original'
proc_dir = 'processed'

if os.path.isdir(os.path.join(dataset_path, proc_dir)):
    shutil.rmtree(os.path.join(dataset_path, proc_dir))


data_items = {}
for part in dataset_parts:
    dir_path = os.path.join(dataset_path, orig_dir)
    file_path = os.path.join(dir_path,  f'dialogues_{part}.txt')
    print(f'Reading data file: {file_path}')
    with open(file_path) as f_obj:
        lines = [line.strip() for line in f_obj.readlines() if line.strip() != '']
        data_items[part] = list(set(lines))

inver_train_items = {item: True for item in data_items['train']}

dedup_data_items = {'train': []}

for key, val in inver_train_items.items():
    dedup_data_items['train'].append(key)

for part in dataset_parts:
    if part == 'train':
        continue

    dedup_data_items[part] = []
    for item in data_items[part]:
        dialog = item
        if dialog not in inver_train_items:
            dedup_data_items[part].append(dialog)

for part in dataset_parts:
    print(f'Total dialogs in {part} before deduplication: {len(data_items[part])} after deduplication: {len(dedup_data_items[part])}')

utterances = {}
contexts = {}
queries = {}
responses = {}
for part in dataset_parts:

    utterances[part] = []
    contexts[part] = []
    queries[part] = []
    responses[part] = []
    for dialog in dedup_data_items[part]:
        dialog_utts = dialog.split('__eou__')

        dialog_utts = [utt.strip() for utt in dialog_utts if utt.strip() != '']

        utterances[part].extend(dialog_utts)
        #print(len(dialog_utts))
        for utt_index in range(1, len(dialog_utts)):
            #print(utt_index)
            #print(dialog_utts[:utt_index])
            #print(dialog_utts[utt_index])
            if utt_index == 1:
                contexts[part].append([])
            else:
                contexts[part].append(dialog_utts[:utt_index-1])

            queries[part].append(dialog_utts[utt_index-1])
            responses[part].append(dialog_utts[utt_index])

proc_dir_path = os.path.join(dataset_path, proc_dir)
os.makedirs(proc_dir_path)


utterance_dict = {}    
query_resp_dict = {}
for part in dataset_parts:            
    # Write Utterances File
    utterances_file_path = os.path.join(proc_dir_path, f'{part}_utterances.tsv')
    with open(utterances_file_path, 'w') as f_obj:
        for utterance in utterances[part]:
            if utterance in utterance_dict:
                continue
            else:
                utterance_dict[utterance] = True
                
            f_obj.write(f'{utterance}\n')

    # Write Dialog File
    dialog_file_path = os.path.join(proc_dir_path, f'{part}_dialog.tsv')
    with open(dialog_file_path, 'w') as f_obj:
        for index in range(len(contexts[part])):
            context = ' '.join(contexts[part][index])
            query = queries[part][index]
            response = responses[part][index]
            
            query_resp_pair = (query, response)
            
            if query_resp_pair in query_resp_dict:
                continue
            else:
                query_resp_dict[query_resp_pair] = True

            f_obj.write(f"{context}\t{query}\t{response}\n")


In [21]:
# ROCStories Dataset

import os
import sys
import os.path
import shutil
import csv

os.chdir('/collection/ka2khan/thesis/Cond_Text_Gen/notebooks')
print(os.getcwd())

dataset_path = '/collection/ka2khan/thesis/Cond_Text_Gen/data/ROCStories'
dataset_parts = ['train', 'valid', 'test']
orig_dir = 'original'
proc_dir = 'processed'

if os.path.isdir(os.path.join(dataset_path, proc_dir)):
    shutil.rmtree(os.path.join(dataset_path, proc_dir))

os.makedirs(os.path.join(dataset_path, proc_dir))

data_items = {}
for part in dataset_parts:
    dir_path = os.path.join(dataset_path, orig_dir)
    file_path = os.path.join(dir_path,  f'{part}.csv')
    print(f'Reading data file: {file_path}')
    data_items[part] = []
    with open(file_path) as f_obj:
        reader = csv.reader(f_obj)
        for index, row in enumerate(reader):
            if index ==0:
                continue

            data_items[part].append(row)    

print()
for part in dataset_parts:
    print(f'Train contains {len(data_items[part])} items.')
    
sentences = {}
first_sents = {}
contexts = {}
curr_sents = {}
next_sents = {}
for part in dataset_parts:
    sentences[part] = []
    contexts[part] = []
    first_sents[part] = []
    curr_sents[part] = []
    next_sents[part] = []
    
    for item in data_items[part]:
        if part == 'train':
            assert len(item) == 7, f'Error! train item does not contain exactly 7 parts! {item}'
            
            story_sents = item[2:]
            
            story_sents = [sent.strip() for sent in story_sents]
            
            sentences[part].extend(story_sents)
            
            first_sents[part].append(story_sents[0])
            
            for sent_index in range(1, len(story_sents)):
                if sent_index == 1:
                    contexts[part].append([])
                else:                
                    contexts[part].append(story_sents[:sent_index-1])

                curr_sents[part].append(story_sents[sent_index-1])
                next_sents[part].append(story_sents[sent_index])
        else:        
            assert len(item) == 8, f'Error! {part} item does not contain exactly 8 parts! {item}'
            
            story_sents = item[1:5]
            
            story_sents = [sent.strip() for sent in story_sents]
            
            con_alts = item[5:7]
            true_ending = int(item[-1])
            story_sents.append(con_alts[true_ending-1])
            
            sentences[part].extend(story_sents)
            
            first_sents[part].append(story_sents[0])
            
            contexts[part].append(story_sents[:len(story_sents)-2])
                
            curr_sents[part].append(story_sents[len(story_sents)-2])
            next_sents[part].append(story_sents[len(story_sents)-1])
        
        
            
for part in dataset_parts:
    print()
    print(f'{part} contains:')
    print(f'{len(sentences[part])} sentences.')
    print(f'{len(first_sents[part])} first sentences.')
    print(f'{len(contexts[part])} contexts.')
    print(f'{len(curr_sents[part])} current sentences.')
    print(f'{len(next_sents[part])} next sentences.')
    
    file_path = os.path.join(dataset_path, proc_dir, f'{part}_sentences.tsv')
    with open(file_path, 'w') as f_obj:
        for sent in sentences[part]:
            f_obj.write(f'{sent}\n')
    
    file_path = os.path.join(dataset_path, proc_dir, f'{part}_first_sents.tsv')
    with open(file_path, 'w') as f_obj:
        for sent in first_sents[part]:
            f_obj.write(f'{sent}\n')
            
    file_path = os.path.join(dataset_path, proc_dir, f'{part}_tuples.tsv')
    with open(file_path, 'w') as f_obj:
        for index in range(len(contexts[part])):
            f_obj.write(f"{' '.join(contexts[part][index])}\t{curr_sents[part][index]}\t{next_sents[part][index]}\n")
    


/collection/ka2khan/thesis/Cond_Text_Gen/notebooks
Reading data file: /collection/ka2khan/thesis/Cond_Text_Gen/data/ROCStories/original/train.csv
Reading data file: /collection/ka2khan/thesis/Cond_Text_Gen/data/ROCStories/original/valid.csv
Reading data file: /collection/ka2khan/thesis/Cond_Text_Gen/data/ROCStories/original/test.csv

Train contains 98161 items.
Train contains 1871 items.
Train contains 1871 items.

train contains:
490805 sentences.
98161 first sentences.
392644 contexts.
392644 current sentences.
392644 next sentences.

valid contains:
9355 sentences.
1871 first sentences.
1871 contexts.
1871 current sentences.
1871 next sentences.

test contains:
9355 sentences.
1871 first sentences.
1871 contexts.
1871 current sentences.
1871 next sentences.


In [33]:
# Taskmaster-2 dataset
import os
import sys
import os.path
import shutil
import csv
import glob
import json
import numpy as np
import math

os.chdir('/collection/ka2khan/thesis/Cond_Text_Gen/notebooks')
print(os.getcwd())

dataset_path = '/collection/ka2khan/thesis/Cond_Text_Gen/data/Taskmaster-2'
dataset_parts = ['train', 'valid', 'test']
orig_dir = 'original'
proc_dir = 'processed'

if os.path.isdir(os.path.join(dataset_path, proc_dir)):
    shutil.rmtree(os.path.join(dataset_path, proc_dir))

os.makedirs(os.path.join(dataset_path, proc_dir))

original_path = os.path.join(dataset_path, orig_dir)

data_files = glob.glob(f'{original_path}/*.json')


train = []
valid = []
test = []
utt_counts = {}
diag_counts = {}
for data_file in data_files:
    print()
    print(f'Processing data file: {data_file}')
    with open(data_file) as f_obj:
        data = json.load(f_obj)
        
    dialogs = []
    utt_counts[data_file] = 0
    for item in data:
        utterances = []
        prev_speaker = None
        for utt_details in item['utterances']:
            if utt_details['speaker'].lower() == prev_speaker:
                utterances[-1] = utterances[-1] + ' ' + utt_details['text']
            else:
                utterances.append(utt_details['text'])
                prev_speaker = utt_details['speaker'].lower()
            
        
        utt_counts[data_file] += len(utterances)
        dialogs.append(utterances)
    
    diag_counts[data_file] = len(dialogs)
    
    print(f'Dialogs found: {diag_counts[data_file]}, Utterances found: {utt_counts[data_file]}')
    
    indices = np.arange(len(dialogs))
    
    #randomly shuffle indices
    np.random.shuffle(indices)
    
    test_indices = indices[:math.ceil(len(indices)*0.1)]
    valid_indices = indices[math.ceil(len(indices)*0.1):2*math.ceil(len(indices)*0.1)]
    train_indices = indices[2*math.ceil(len(indices)*0.1):]
    
    print(f'Training set size: {len(train_indices)}')
    print(f'Validation set size: {len(valid_indices)}')
    print(f'Test set size: {len(test_indices)}')
    
    for index in train_indices:
        train.append(dialogs[index])
    
    for index in valid_indices:
        valid.append(dialogs[index])
    
    for index in test_indices:
        test.append(dialogs[index])
        
    
print(f'Total Dialogs found: {np.sum(list(diag_counts.values()))}, Utterances found: {np.sum(list(utt_counts.values()))}')

print()
print(f'Training set contains {len(train)} dialogs.')
print(f'Validation set contains {len(valid)} dialogs.')
print(f'Test set contains {len(test)} dialogs.')

data = {'train': train, 'valid': valid, 'test': test}

utterances = {}
contexts = {}
queries = {}
responses = {}
for part in dataset_parts:

    utterances[part] = []
    contexts[part] = []
    queries[part] = []
    responses[part] = []
    for dialog_utts in data[part]:
        
        dialog_utts = [utt.strip() for utt in dialog_utts if utt.strip() != '']

        utterances[part].extend(dialog_utts)
        #print(len(dialog_utts))
        for utt_index in range(1, len(dialog_utts)):
            #print(utt_index)
            #print(dialog_utts[:utt_index])
            #print(dialog_utts[utt_index])
            if utt_index == 1:
                contexts[part].append([])
            else:
                contexts[part].append(dialog_utts[:utt_index-1])

            queries[part].append(dialog_utts[utt_index-1])
            responses[part].append(dialog_utts[utt_index])

proc_dir_path = os.path.join(dataset_path, proc_dir)
            
utterance_dict = {}    
query_resp_dict = {}
for part in dataset_parts:            
    # Write Utterances File
    utterances_file_path = os.path.join(proc_dir_path, f'{part}_sentences.tsv')
    with open(utterances_file_path, 'w') as f_obj:
        for utterance in utterances[part]:
            if utterance in utterance_dict:
                continue
            else:
                utterance_dict[utterance] = True
                
            f_obj.write(f'{utterance}\n')

    # Write Dialog File
    dialog_file_path = os.path.join(proc_dir_path, f'{part}_tuples.tsv')
    with open(dialog_file_path, 'w') as f_obj:
        for index in range(len(contexts[part])):
            context = ' '.join(contexts[part][index])
            query = queries[part][index]
            response = responses[part][index]
            
            query_resp_pair = (query, response)
            
            if query_resp_pair in query_resp_dict:
                continue
            else:
                query_resp_dict[query_resp_pair] = True

            f_obj.write(f"{context}\t{query}\t{response}\n")
 


/collection/ka2khan/thesis/Cond_Text_Gen/notebooks

Processing data file: /collection/ka2khan/thesis/Cond_Text_Gen/data/Taskmaster-2/original/movies.json
Dialogs found: 3056, Utterances found: 51129
Training set size: 2444
Validation set size: 306
Test set size: 306

Processing data file: /collection/ka2khan/thesis/Cond_Text_Gen/data/Taskmaster-2/original/sports.json
Dialogs found: 3481, Utterances found: 44796
Training set size: 2783
Validation set size: 349
Test set size: 349

Processing data file: /collection/ka2khan/thesis/Cond_Text_Gen/data/Taskmaster-2/original/hotels.json
Dialogs found: 2357, Utterances found: 52700
Training set size: 1885
Validation set size: 236
Test set size: 236

Processing data file: /collection/ka2khan/thesis/Cond_Text_Gen/data/Taskmaster-2/original/music.json
Dialogs found: 1603, Utterances found: 21358
Training set size: 1281
Validation set size: 161
Test set size: 161

Processing data file: /collection/ka2khan/thesis/Cond_Text_Gen/data/Taskmaster-2/orig

In [9]:
import os
import sys
import os.path
import shutil
import csv
import glob
import json
import numpy as np
import math
import csv

os.chdir('/collection/ka2khan/thesis/Cond_Text_Gen/notebooks')
print(os.getcwd())

dataset_path = '/collection/ka2khan/thesis/Cond_Text_Gen/data/Taskmaster-2'
dataset_parts = ['train', 'valid', 'test']
orig_dir = 'original'
proc_dir = 'processed'

dest_path = '/collection/ka2khan/DialogWAE/data/Taskmaster2'

original_path = os.path.join(dataset_path, orig_dir)

data_files = glob.glob(f'{original_path}/*.json')

dialogs = {}
for data_file in data_files:
    print()
    print(f'Processing data file: {data_file}')
    with open(data_file) as f_obj:
        data = json.load(f_obj)
        
    for item in data:
        utterances = []
        prev_speaker = None
        for utt_details in item['utterances']:
            if utt_details['speaker'].lower() == prev_speaker:
                utterances[-1] = utterances[-1] + ' ' + (utt_details['text'].strip())
            else:
                utterances.append(utt_details['text'].strip())
                prev_speaker = utt_details['speaker'].lower()
            
        
        dialogs[utterances[0]] = utterances
    

print(f'Total Dialogs found: {len(dialogs)}')

data_items = {}
for part in dataset_parts:
    data_items[part] = []
    file_path = os.path.join(dataset_path, proc_dir, f'{part}_tuples.tsv')
    f_obj = open(file_path)
    reader = csv.reader(f_obj, delimiter='\t')
    
    for row in reader:
        
        if row[0].strip() != '':
            continue
        
        data_items[part].append(dialogs[row[1]])

    print(f'{part} contains {len(data_items[part])} dialogs')
    
    f_obj = open(os.path.join(dest_path, f'{part}.utts.txt'), 'w')
    for dialog in data_items[part]:
        line = ' __eou__ '.join(dialog)
        f_obj.write(f'{line}\n')
    f_obj.close()



/collection/ka2khan/thesis/Cond_Text_Gen/notebooks

Processing data file: /collection/ka2khan/thesis/Cond_Text_Gen/data/Taskmaster-2/original/movies.json

Processing data file: /collection/ka2khan/thesis/Cond_Text_Gen/data/Taskmaster-2/original/sports.json

Processing data file: /collection/ka2khan/thesis/Cond_Text_Gen/data/Taskmaster-2/original/hotels.json

Processing data file: /collection/ka2khan/thesis/Cond_Text_Gen/data/Taskmaster-2/original/music.json

Processing data file: /collection/ka2khan/thesis/Cond_Text_Gen/data/Taskmaster-2/original/restaurant-search.json

Processing data file: /collection/ka2khan/thesis/Cond_Text_Gen/data/Taskmaster-2/original/food-ordering.json

Processing data file: /collection/ka2khan/thesis/Cond_Text_Gen/data/Taskmaster-2/original/flights.json
Total Dialogs found: 5794
train contains 12865 dialogs
valid contains 1574 dialogs
test contains 1577 dialogs
