# Ubuntu Dialogue Corpus Exploration

## Dataset Description

Random facts [from paper](https://arxiv.org/abs/1506.08909):
* 2-way (dyadic) conversation, as opposed to multi-participant.


## Data Pre-Processing

In [3]:
#from keras.models import Sequential
#from keras.layers import Dense, Activation
import numpy as np
import os.path
import pandas as pd

TRAIN_PATH = '/home/brandon/terabyte/Datasets/ubuntu_dialogue_corpus/src/train.csv'
VOCAB_SIZE = 2000

### Tokenize [focusing on training data only for now]

Goal: Convert sentences in data to integer sequences. 

In [4]:
# First, we need to load the data directly into a dataframe from the train.csv file. 
df_train = pd.read_csv(TRAIN_PATH)
# Remove all examples with label = 0. (why would i want to train on false examples?)
df_train = df_train.loc[df_train['Label'] == 1.0]
# Don't care about the pandas indices in the df, so remove them.
df_train = df_train.reset_index(drop=True)
df_train.head()

Unnamed: 0,Context,Utterance,Label
0,i think we could import the old comments via r...,basically each xfree86 upload will NOT force u...,1.0
1,interesting __eou__ grub-install worked with /...,thats the one __eou__,1.0
2,and because Python gives Mark a woody __eou__ ...,(i thought someone was going to make a joke ab...,1.0
3,"edd will turn up here soon too, btw __eou__ __...",you need to go out of the sleepy mode dude __e...,1.0
4,do you know about this problem ? __eou__ __eot...,unset LANGUAGE && LANG=en_GB.UTF-8 gnome-termi...,1.0


In [5]:
# Get the df as a single text string.
def df_to_string(df):
    """ Expects df to be 3 columns of form above. """
    # Remove the 'label' column since we are only interested in the text here.
    df_text = df.copy()
    del df_text['Label']
    text = df_text['Context'].str.cat(sep=' ') + ' ' + df_text['Utterance'].str.cat(sep=' ')
    return text
text_train = df_to_string(df_train)
print("Number of characters (total) in train.csv:", len(text_train))

Number of characters (total) in train.csv: 2610845


In [6]:
# Now get all of the data in a single string and make a 'vocabulary' (unique words). 
import nltk, re, pprint
from nltk import word_tokenize
tokens_train = word_tokenize(text_train)
vocab_train = sorted(set(tokens_train)) # Sorted 'alphabetically', NOT frequency!!
print("type(vocab_train) = ", type(vocab_train))
print("There are {} unique words in train.csv.".format(len(vocab_train)))

type(vocab_train) =  <class 'list'>
There are 25583 unique words in train.csv.


In [7]:
freq_dist = nltk.FreqDist(tokens_train)
most_common = freq_dist.most_common(VOCAB_SIZE)
print("Most common word [in vocab] was {} and appeared {} times.".format(*most_common[0]))
print("Least common word [in vocab] was {} and appeared {} times.".format(*most_common[-1]))

Most common word [in vocab] was __eou__ and appeared 43035 times.
Least common word [in vocab] was serial and appeared 15 times.


In [8]:
word_to_index = {w: i for i, w in enumerate(np.array(most_common)[:, 0])}
index_to_word = {i: w for i, w in enumerate(np.array(most_common)[:, 0])}

In [15]:
def array_to_indices(arr):
    return np.array([[word_to_index[w] if w in word_to_index else -1 for w in word_tokenize(item)]\
            for item in arr])

context_as_indices = array_to_indices(df_train['Context'].values)
utter_as_indices = array_to_indices(df_train['Utterance'].values)
# Check output is sensible.
#[index_to_word[i] if i in index_to_word else 'UNKNOWN' for i in context_as_indices[0]]

In [11]:
df_index_train = pd.DataFrame(np.hstack((context_as_indices[:, None], utter_as_indices[:, None])), 
                              columns=['Context', 'Utterance'])
print(len(df_index_train))
df_index_train.head()

4993


Unnamed: 0,Context,Utterance
0,"[12, 78, 93, 113, -1, 3, 312, 1942, 478, 999, ...","[656, 694, 1370, 1254, 71, 1470, 1209, 351, 5,..."
1,"[657, 0, -1, 343, 28, 218, 292, 896, 2, 855, 7...","[247, 3, 72, 0]"
2,"[15, 147, 1666, 681, 1703, 9, 1584, 0, 1, 12, ...","[34, 12, 259, 229, 57, 191, 5, 120, 9, -1, 62,..."
3,"[-1, 71, 711, 74, 108, 520, 121, 2, 511, 0, 1,...","[10, 67, 5, 148, 90, 24, 3, -1, 533, 532, 0, -..."
4,"[19, 10, 55, 62, 53, 89, 4, 0, 1, 42, 0, 1, 93...","[-1, -1, 347, 347, -1, 1508, 4, 0, -1, 11, 200..."


In [16]:
print(VOCAB_SIZE)
n_latent_factors = 42
n_hidden = 256

Input(shape=(1,), dtype='int64', name='input')
Embedding(input_dim=VOCAB_SIZE, output_dim=42, input_length=1)
LSTM(output_dim=256)
LSTM(output_dim=VOCAB_SIZE)
Dense(output_dim=VOCAB_SIZE, activation='softmax')

2000


## Making the Keras Seq2Seq Model

In [28]:
import seq2seq
from seq2seq.models import SimpleSeq2Seq
from keras.layers import Embedding, Input, Flatten
from keras.models import Sequential

model = Sequential()
#model.add(Input(shape=(1,), dtype='int64', name='inputs'))
model.add(Embedding(input_dim=1, output_dim=VOCAB_SIZE+1))
#model.add(Flatten())
seq2seqmod = SimpleSeq2Seq(input_dim=VOCAB_SIZE+1, hidden_dim=10, output_length=8, output_dim=VOCAB_SIZE+1)
model.add(seq2seqmod)
model.compile(loss='mse', optimizer='rmsprop')
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_6 (Embedding)          (None, None, 2001)    2001        embedding_input_5[0][0]          
____________________________________________________________________________________________________
sequential_34 (Sequential)       (None, 8, 2001)       16184528    embedding_6[0][0]                
Total params: 16,186,529
Trainable params: 16,186,529
Non-trainable params: 0
____________________________________________________________________________________________________


In [32]:
X_train, y_train = df_index_train['Context'].values, df_index_train['Utterance'].values
print(X_train.shape, y_train.shape)
model.fit(np.stack(X_train, 1), y_train, nb_epoch=1)

(4993,) (4993,)


ValueError: all input arrays must have the same shape

### Random stuff I did earlier with the train df.

In [71]:
lab_one.size
lab_one = pos.loc[pos['Label'] == 1.0]
lab_one.head()
c = lab_one['Utterance'].values[3]
c

14979

In [58]:
def print_conversation(df, i=0):
    context = df['Context'].values[i]
    context_split = context.split('__eot__')
    print('--------------------- CONTEXT ------------------- ')
    for utter in context_split:
        u_split = utter.split('__eou__')
        for u in u_split:
            print(u)
        print('')
    utterance = df['Utterance'].values[i]
    print('--------------------- RESPONSE ------------------- ')
    for u in utterance.split('__eou__'):
        print(u)

print_conversation(lab_one, 0)

--------------------- CONTEXT ------------------- 
i think we could import the old comments via rsync, but from there we need to go via email. I think it is easier than caching the status on each bug and than import bits here and there 
 

 it would be very easy to keep a hash db of message-ids  
 sounds good 
 

 ok 
 perhaps we can ship an ad-hoc apt_prefereces 
 

 version? 
 

 thanks 
 

 not yet 
 it is covered by your insurance? 
 

 yes 
 but it's really not the right time :/ 
 with a changing house upcoming in 3 weeks 
 

 you will be moving into your house soon? 
 posted a message recently which explains what to do if the autoconfiguration does not do what you expect 
 

 how urgent is #896? 
 

 not particularly urgent, but a policy violation 
 

 i agree that we should kill the -novtswitch 
 

 ok 
 

 would you consider a package split a feature? 
 

 context? 
 

 splitting xfonts* out of xfree86*. one upload for the rest of the life and that's it 
 

 splitting the sourc

In [72]:
text = ''
i = 0
for row in lab_one.values[:2000]:
    for col in row[:2]:
        text += col

In [73]:
unique_chars = sorted(list(set(text)))
print('total unique chars:', len(unique_chars))
char_indices = dict((c, i) for i, c in enumerate(unique_chars))
indices_char = dict((i, c) for i, c in enumerate(unique_chars))

total unique chars: 95


In [74]:
def print_sentence_array(sentences):
    print("Contents of sentences:")
    for i, s in enumerate(sentences): print(i, ":    ", s)

# cut the text in semi-redundant sequences of maxlen characters
sequence_length = 40
sequence_stride = 3
sequences = [] 
next_chars = []

for i in range(0, len(text) - sequence_length, sequence_stride):
    sequences.append(text[i: i + sequence_length])
    # next_chars contains the single char that came after the sequence in line above.
    next_chars.append(text[i + sequence_length])
assert(len(sequences) == len(next_chars))
print('nb sequences:', len(sequences))
print_sentence_array(sequences[:10])

print('Vectorization...')
# X is a boolean grid over the unique characters. Basically one-hot encoding every since char in sequences (I know). 
X = np.zeros((len(sequences), sequence_length, len(unique_chars)), dtype=np.bool)
y = np.zeros((len(next_chars), len(unique_chars)), dtype=np.bool)
for i_seq, seq in enumerate(sequences):
    for i_char, char in enumerate(seq):
        # One-hot encode. Yup. DON'T MESS WITH THIS ITS FASTER
        X[i_seq, i_char, char_indices[char]] = 1
    y[i_seq, char_indices[next_chars[i_seq]]] = 1

nb sequences: 356796
Contents of sentences:
0 :     i think we could import the old comments
1 :     hink we could import the old comments vi
2 :     k we could import the old comments via r
3 :     e could import the old comments via rsyn
4 :     ould import the old comments via rsync, 
5 :     d import the old comments via rsync, but
6 :     mport the old comments via rsync, but fr
7 :     rt the old comments via rsync, but from 
8 :     the old comments via rsync, but from the
9 :      old comments via rsync, but from there 
Vectorization...


In [75]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys

def sample_char(preds, temperature=1.0):
    """ Helper function to sample a character from a probability array. """
    
    # Convert preds from boolean to float. 
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(n=1, pvals=preds, size=1)
    return indices_char[np.argmax(probas)]

def one_hot(sequence):
    res = np.zeros((1, sequence_length, len(unique_chars)))
    for t, char in enumerate(sequence):
        res[0, t, char_indices[char]] = 1.
    return res

# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(output_dim=128, input_shape=(sequence_length, len(unique_chars))))
model.add(Dense(output_dim=len(unique_chars)))
model.add(Activation('softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Build model...


In [78]:
def generate_sentence(seed="When I think about life", temperature=1.0):
    start_index = random.randint(0, len(text) - sequence_length - 1)
    
    print('\nTemperature:', temperature)
    #sentence  = text[start_index: start_index + sequence_length]
    sentence = seed
    generated = sentence
    print('Generating with seed: "' + sentence + '"')

    for i in range(400):
        x = one_hot(sentence)
        preds = model.predict(x, verbose=0)[0]
        next_char  = sample_char(preds, temperature)
        generated += next_char
        sentence   = sentence[1:] + next_char
        #print(next_char, end='')
        #sys.stdout.flush()
    return(generated)

#train the model, output generated text after each iteration
for iteration in range(1, 2):
    print('Iteration', iteration)
    model.fit(X, y, batch_size=128, nb_epoch=1)

Iteration 1
Epoch 1/1
 22144/356796 [>.............................] - ETA: 79s - loss: 1.4675 

KeyboardInterrupt: 

In [80]:
generate_sentence()


Temperature: 1.0
Generating with seed: "When I think about life"


'When I think about lifeh"n/H0,"S::NkxhDmD/f6iIg>S_vy- Mx:FYI!1SRC-:GS_ T//,C4bP"SdhG:py/*i-lb:_6no?x./"hN-LG:-  - yl-3mD2 _2/c523-FO"v-;_.3Pi\'4NIjk 3?M-*_PNC2\'d.3 #.W1_\'Fr)"-wn-q/AA3I:.uyxNpC-?)S-DY;?A_G W_c)AIg(b,2S__q-wMCC;miDz"."-gf5MF_c-m,:g/6F zo"-.fSum"vGphdO6:1_m)o?hYIN/43j_a".m;"Yjm"/S 4!w2j4_/FCby:_T A1_/Tju-k:eg",g1tS (8.:j31"!_ 1G":"wzg--f(")h"/  "-TGkpFe__kxCP"Wj!-o"=lCN.Ud .t":S.U0DS0(-2i)c"vw_l- gIp\'mgqCB2'

# Code that may be useful but idk

In [None]:
if not os.path.isfile("data/pos.npy"):
    pos = np.loadtxt('data/pos.csv', delimiter=',', dtype=np.float32)
    np.save('data/pos.npy', pos);
else:
    pos = np.load('data/pos.npy')

if not os.path.isfile("data/neg.npy"):
    neg = np.loadtxt('data/neg.csv', delimiter=',', dtype=np.float32)
    np.save('data/neg.npy', neg);
else:
    neg = np.load('data/neg.npy')

In [None]:
nltk_text_train = nltk.Text(tokens_train)
# collocation: the habitual juxtaposition of a particular word with another word or words with a frequency greater than chance.
nltk_text_train.collocations()

In [None]:
print('Vectorization...')
# X is a boolean grid over the unique characters. Basically one-hot encoding every since char in sequences (I know). 
X = np.zeros_like((len(df_index_train), VOCAB_SIZE+1, len(unique_chars)), dtype=np.bool)
y = np.zeros((len(df_index_train), VOCAB_SIZE+1), dtype=np.bool)
for i_seq, seq in enumerate(sequences):
    for i_char, char in enumerate(seq):
        # One-hot encode. Yup. DON'T MESS WITH THIS ITS FASTER
        X[i_seq, i_char, char_indices[char]] = 1
    y[i_seq, char_indices[next_chars[i_seq]]] = 1