In [1]:
"""
Takes as input a dialogue file and creates a processed version of it.
If given an external dictionary, the input dialogue file will be converted
using that input dictionary.

@author Alessandro Sordoni, Iulian Vlad Serban
"""

import collections
import numpy
import operator
import os
import sys
import logging
import pickle
import pandas as pd

from collections import Counter

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('text2dict')

In [2]:
q_a_df = pd.read_csv('../../minerva/data/q_a_all.csv')


#"Dialogue file; assumed shuffled with one document 
# (e.g. one movie dialogue, or one Twitter conversation or one Ubuntu conversation) per line")


train = q_a_df.iloc[:25000]
validation = q_a_df.iloc[25000:26000]
test = q_a_df.iloc[26000:]

input_list = {'data/Q_A_dataset.txt': q_a_df,
              'data/Q_A_train.txt': train,
              'data/Q_A_validation.txt': validation,
              'data/Q_A_test.txt': test}


def write_file(df):
    for index, row in df.iterrows():
        text = row['Q'].strip() + ' </s> ' + row['A'].strip()
        if text != '':
            file.write(text + '\n')

for dialogue_file, df in input_list.items():
    with open(dialogue_file, 'w') as file:
        write_file(df)

    
q_a_df.tail()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Q,A,SubjectOfBusinessTitle,FloorLanguage,date,personSpeaking
27367,27367,27367,"BOS Mr. Speaker, yesterday, an American giant ...","BOS Mr. Speaker, I would like to thank the hon...",International Trade,FR,2016-02-04,"Ms. Brigitte Sansoucy (Saint-Hyacinthe—Bagot, ..."
27368,27368,27368,"BOS Mr. Speaker, the government has agreed to ...","BOS Mr. Speaker, that could not be further fro...",Employment Insurance,FR,2013-02-01,Hon. Stéphane Dion (Saint-Laurent—Cartierville...
27369,27369,27369,"BOS Mr. Speaker, while SMEs like the businesse...","BOS Mr. Speaker, we know that it is very impor...",Taxation,FR,2017-02-24,Mr. Alexandre Boulerice (Rosemont—La Petite-Pa...
27370,27370,27370,"BOS Mr. Speaker, as researcher Alain Deneault ...","BOS Mr. Speaker, our government is committed t...",Taxation,FR,2017-02-24,"Hon. Bill Morneau (Minister of Finance, Lib.)"
27371,27371,27371,"BOS Mr. Speaker, Canadians are tired of the cu...","BOS Mr. Speaker, I was very pleased last year ...",Government Accountability,EN,2017-02-24,"Ms. Cheryl Hardcastle (Windsor—Tecumseh, NDP)"


In [3]:
def safe_pickle(obj, filename):
    if os.path.isfile(filename):
        logger.info("Overwriting %s." % filename)
    else:
        logger.info("Saving to %s." % filename)
    
    with open(filename, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)


# "Vocabulary cutoff (optional)")
cutoff = 1000




In [8]:

unk = "<unk>"

###############################
# Part I: Create the dictionary
###############################
def load_dict(dict_file):
    # Load external dictionary
    assert os.path.isfile(dict_file)
    vocab = dict([(x[0], x[1]) for x in pickle.load(open(dict_file, "rb"))])
    
    # Check consistency
    assert '<unk>' in vocab
    assert '</s>' in vocab
    assert '</d>' in vocab

    # Also check special tags, which must exist in the Movie-Scriptolog dataset
    assert '<first_speaker>' in vocab
    assert '<second_speaker>' in vocab
    assert '<third_speaker>' in vocab
    assert '<minor_speaker>' in vocab
    assert '<voice_over>' in vocab
    assert '<off_screen>' in vocab
    assert '<pause>' in vocab
    
#     return vocab

def create_vocab(input_file):
    word_counter = Counter()

    for line in open(input_file, 'r'):
        line_words = line.strip().split()
        
        if len(line_words) == 0:
            continue
            
        if line_words[len(line_words)-1] != '</s>':
            line_words.append('</s>')

        s = [x for x in line_words]
        word_counter.update(s) 

    total_freq = sum(word_counter.values())
    logger.info("Total word frequency in dictionary %d " % total_freq) 

    if cutoff != -1:
        logger.info("Cutoff %d" % cutoff)
        vocab_count = word_counter.most_common(cutoff)
    else:
        vocab_count = word_counter.most_common()

    # Add special tokens to the vocabulary
    vocab = {'<unk>': 0, '</s>': 1, '</d>': 2, 
             '<first_speaker>': 3, '<second_speaker>': 4, 
             '<third_speaker>': 5, '<minor_speaker>': 6, 
             '<voice_over>': 7, '<off_screen>': 8, '<pause>': 9}

    # Add other tokens to vocabulary in the order of their frequency
    i = 10
    for (word, count) in vocab_count:
        if not word in vocab:
            vocab[word] = i
            i += 1
            
    return vocab

In [5]:
#################################
# Part II: Binarize the dialogues
#################################

def create_dict(input_file, output):
    # Create vocab
    vocab = create_vocab(input_file)
    logger.info("Vocab size %d" % len(vocab))


    # Everything is loaded into memory for the moment
    binarized_corpus = []
    # Some statistics
    unknowns = 0.
    num_terms = 0.
    freqs = collections.defaultdict(lambda: 0)

    # counts the number of dialogues each unique word exists in; also known as document frequency
    df = collections.defaultdict(lambda: 0)

    for line, dialogue in enumerate(open(input_file, 'r')):
        dialogue_words = dialogue.strip().split()
        if len(dialogue_words) == 0:
            continue

        if dialogue_words[len(dialogue_words)-1] != '</s>':
            dialogue_words.append('</s>')

        # Convert words to token ids and compute some statistics
        dialogue_word_ids = []
        for word in dialogue_words:
            word_id = vocab.get(word, 0)
            dialogue_word_ids.append(word_id)
            unknowns += 1 * (word_id == 0)
            freqs[word_id] += 1

        num_terms += len(dialogue_words)

        # Compute document frequency statistics
        unique_word_indices = set(dialogue_word_ids)
        for word_id in unique_word_indices:
            df[word_id] += 1

        # Add dialogue to corpus
        binarized_corpus.append(dialogue_word_ids)

    safe_pickle(binarized_corpus, "data/%s.dialogues.pkl" % output)

    if dict_file == '':
         safe_pickle([(word, word_id, freqs[word_id], df[word_id]) for word, word_id in vocab.items()], 
                     'data/%s.dict.pkl' % output)

    logger.info("Number of unknowns %d" % unknowns)
    logger.info("Number of terms %d" % num_terms)
    logger.info("Mean document length %f" % float(sum(map(len, binarized_corpus))/len(binarized_corpus)))
    logger.info("Writing training %d dialogues (%d left out)" % (len(binarized_corpus), line + 1 - len(binarized_corpus)))
    


In [6]:
# python convert-text2dict.py <training_file> --cutoff <vocabulary_size> Training 
# python convert-text2dict.py <validation_file> --dict=Training.dict.pkl Validation 
# python convert-text2dict.py <test_file> --dict=Training.dict.pkl <vocabulary_size> Test

# # "Prefix of the pickle binarized dialogue corpus")
# output = 'data/'


# "External dictionary (pkl file)")
dict_file = ''

input_list = {'data/Q_A_dataset.txt': 'dataset',
              'data/Q_A_train.txt': 'train',
              'data/Q_A_validation.txt': 'validation',
              'data/Q_A_test.txt': 'test'}


for dialogue_file, output in input_list.items():
    create_dict(dialogue_file, output)


INFO:text2dict:Total word frequency in dictionary 5317375 
INFO:text2dict:Cutoff 1000
INFO:text2dict:Vocab size 1009
INFO:text2dict:Overwriting data/dataset.dialogues.pkl.
INFO:text2dict:Overwriting data/dataset.dict.pkl.
INFO:text2dict:Number of unknowns 1158506
INFO:text2dict:Number of terms 5317375
INFO:text2dict:Mean document length 184.464546
INFO:text2dict:Writing training 28826 dialogues (329 left out)
INFO:text2dict:Total word frequency in dictionary 260807 
INFO:text2dict:Cutoff 1000
INFO:text2dict:Vocab size 1009
INFO:text2dict:Overwriting data/test.dialogues.pkl.
INFO:text2dict:Overwriting data/test.dict.pkl.
INFO:text2dict:Number of unknowns 55372
INFO:text2dict:Number of terms 260807
INFO:text2dict:Mean document length 175.155809
INFO:text2dict:Writing training 1489 dialogues (25 left out)
INFO:text2dict:Total word frequency in dictionary 191334 
INFO:text2dict:Cutoff 1000
INFO:text2dict:Vocab size 1009
INFO:text2dict:Overwriting data/validation.dialogues.pkl.
INFO:text2di

In [9]:
load_dict('data/train.dict.pkl')