In [0]:
import git
import os
import zipfile
import shutil
import tempfile

In [0]:
# this will download the necessary data we will be working with
# The work is split into two independently guarded steps so that a run that was
# interrupted after the clone (but before the corpus was unpacked) is repaired on re-run:
#   1. clone the repository if its folder is missing
#   2. unpack the corpus if the extracted 'swda' folder is missing
swda_repo_dir = './AI_workshop2019/swda_master'
swda_corpus_dir = os.path.join(swda_repo_dir, 'swda')

if not os.path.exists(swda_repo_dir):
    print('Downloading the most recent snapshot of the swda git repo...')
    # download the swda repository from github (depth=1 fetches only the latest commit)
    git.Repo.clone_from('https://github.com/cgpotts/swda', swda_repo_dir, branch='master', depth=1)

if not os.path.isdir(swda_corpus_dir):
    # unzip the zipped corpus located in the swda_master folder into a temporary dir;
    # the context manager removes that dir even when extraction or the move fails
    with tempfile.TemporaryDirectory() as t:
        with zipfile.ZipFile(os.path.join(swda_repo_dir, 'swda.zip'), 'r') as zip_ref:
            zip_ref.extractall(t)

        # move the corpus folder to the current master folder
        shutil.move(os.path.join(t, 'swda'), swda_repo_dir)

    print('Successfully downloaded the swda repository!')

## creates an html file that contains the structure of the directory
#os.system('tree -H ./AI_workshop2019/swda_master > ./AI_workshop2019/directory_tree.html')
## prints the files/folders in the immediate directory and their sizes
#print('Here is a glimpse of the directory structure.')
#os.system('du -sh ./AI_workshop2019/swda_master/*')

Now we will import the packages required to look at the data.

In [0]:
# data wrangling
import numpy as np
import pandas as pd

# plotting
import matplotlib.pyplot as plt

# QOL
from tqdm import tqdm
from collections import defaultdict

# stats
from scipy.stats import ks_2samp

# nltk
import nltk
from nltk.corpus import treebank

# function from swda dataset
import sys
sys.path.insert(0, './AI_workshop2019/swda_master/')

from swda import CorpusReader

In [0]:
def treebank_pos_dist():
    """Return the relative frequency of every POS tag in the NLTK subset of the WSJ Treebank.

    Returns:
        dict: tag -> share of all tagged words that carry that tag.
    """
    tag_counts = defaultdict(int)
    for fileid in treebank.fileids():
        # each item is indexed at [1] to pick out the tag; the word itself is ignored
        for tagged in treebank.tagged_words(fileid):
            tag_counts[tagged[1]] += 1
    n_tokens = float(sum(tag_counts.values()))
    return {tag: n / n_tokens for tag, n in tag_counts.items()}

def tag_dist():
    """Compute the relative frequency distribution of POS tags over the SWDA corpus.

    The corpus is opened with ``CorpusReader('swda')``, a path relative to the current
    working directory. Nothing is printed apart from the tqdm progress output.

    Returns:
        dict: tag -> share of all counted tags.
    """
    tag_counts = defaultdict(int)
    reader = CorpusReader('swda')
    # tqdm() around the iterator provides a timer for the loop: a progress bar when the
    # length of the wrapped object is known, otherwise just a standard timer
    for utt in tqdm(reader.iter_utterances(display_progress=False)):
        for pair in np.asarray(utt.pos_lemmas(wn_lemmatize=False)):
            # element [1] of each entry is the tag
            tag_counts[pair[1]] += 1
    n_tags = float(sum(tag_counts.values()))
    return {tag: n / n_tags for tag, n in tag_counts.items()}

In [0]:
# Move into the cloned repo: CorpusReader('swda') and the '../*.png' paths used by later
# cells are relative to that folder. The guard makes the cell safe to re-run; without it a
# second run would try to resolve the relative path from inside swda_master and fail.
swda_workdir = os.path.join('AI_workshop2019', 'swda_master')
if not os.path.normpath(os.getcwd()).endswith(swda_workdir):
    # remember where we started before changing directory
    parent_dir = os.getcwd()
    os.chdir('./AI_workshop2019/swda_master/')

# check out the dictionary generated for the swda corpus
swda_POSdist = tag_dist()
swda_POSdist

In [0]:
# and for the WSJ subset
# make sure the 'treebank' corpus read by treebank_pos_dist() is available locally
nltk.download('treebank')
WSJ_POSdist = treebank_pos_dist()
# last expression of the cell: display the tag -> relative frequency dict
WSJ_POSdist

In [0]:
# number of distinct POS tags per dataset = number of keys in its distribution dict
count = len(WSJ_POSdist)
print('The total count of different POS tags in the WSJ subset is {}.'.format(count))

count = len(swda_POSdist)
print('The total count of different POS tags in the SWDA corpus is {}.'.format(count))

In [0]:
# order each list of (tag, relative frequency) pairs from most to least frequent before plotting
sorted_WSJ = list(WSJ_POSdist.items())
sorted_WSJ.sort(key=lambda pair: pair[1], reverse=True)
sorted_SWDA = list(swda_POSdist.items())
sorted_SWDA.sort(key=lambda pair: pair[1], reverse=True)

In [0]:
# rotate the x tick markers to try to help reduce clutter
plt.xticks(rotation=50)
# unpacks and plots the list of tuples:
# zip(*sorted_SWDA) splits the (tag, frequency) pairs into one tuple of tags and one tuple
# of frequencies, which become the x and height arguments of plt.bar
plt.bar(*zip(*sorted_SWDA), color='#FFD700')
plt.ylabel('Relative frequency')
plt.title('Relative frequency distribution of POS tags in SWDA corpus')

# the path is relative to the working directory, which an earlier cell changed to swda_master,
# so the image lands one level above it
plt.savefig('../swda_POSdist.png')

In [0]:
# rotate the x tick markers to try to help reduce clutter
plt.xticks(rotation=50)
# zip(*sorted_WSJ) splits the (tag, frequency) pairs into the x and height arguments of plt.bar
plt.bar(*zip(*sorted_WSJ), color='#7F00FF')
plt.ylabel('Relative frequency')
plt.title('Relative frequency distribution of POS tags in WSJ subset')

# the path is relative to the working directory, which an earlier cell changed to swda_master
plt.savefig('../wsj_POSdist.png')

In [0]:
# the ten most frequent SWDA tags (sorted_SWDA is ordered by descending relative frequency)
sorted_SWDA[:10]

In [0]:
# the ten most frequent WSJ tags (sorted_WSJ is ordered by descending relative frequency)
sorted_WSJ[:10]

Personal pronouns (PRP) make a top ten appearance in the SWDA dataset, but not in the WSJ subset. This makes sense given the conversational nature of the SWDA dataset. Verbs, adverbs, and interjections, in the form of VBP, RB, and UH respectively also make an appearance in the SWDA dataset and not in the WSJ subset. Again, this is most likely due to the conversational nature of the SWDA corpus.

On the flip side, nouns and adjectives, in the form of NN, NNP, NNS, and JJ, make an appearance in the top ten for the WSJ subset, while only nouns in the form of NN make an appearance in the top ten for the SWDA corpus. This, as with the verbs and adverbs previously discussed, is most likely due to the differences in the nature of the two datasets. The WSJ subset contains descriptions of current events, which would lead to a greater number of nouns and adjectives.

Now let's see if our suspicions about these datasets hold true. We will now statistically test whether or not these distributions are similar using the Kolmogorov-Smirnov test.

In [0]:
def treebank_tagList():
    """Collect the POS tag of every tagged word in the NLTK subset of the WSJ Treebank.

    Returns:
        list: one tag per tagged word, file by file in the order given by ``treebank.fileids()``.
    """
    return [
        tagged[1]
        for fileid in treebank.fileids()
        for tagged in treebank.tagged_words(fileid)
    ]

def swda_tagList():
    """Collect the POS tag of every token in the SWDA corpus.

    The corpus is opened with ``CorpusReader('swda')``, a path relative to the current
    working directory.

    Returns:
        list: one tag per ``pos_lemmas`` entry, in utterance order.
    """
    reader = CorpusReader('swda')
    # element [1] of each pos_lemmas entry is the tag; tqdm only reports progress
    return [
        pair[1]
        for utt in tqdm(reader.iter_utterances(display_progress=False))
        for pair in np.asarray(utt.pos_lemmas(wn_lemmatize=False))
    ]


In [0]:
wsj_tagList = treebank_tagList()
# The result keeps the name `swda_tagList` because later cells read it, but that assignment
# replaces the function of the same name. Only call it while the name is still bound to the
# function, so re-running this cell does not raise "TypeError: 'list' object is not callable".
if callable(swda_tagList):
    swda_tagList = swda_tagList()
print('The length of the SWDA tag list is {}. The length of the WSJ tag list is {}.'.format(len(swda_tagList), len(wsj_tagList)))

In [0]:
# run a Kolmogorov-Smirnov two-sample test to see if these datasets came from the same distribution
# use this to estimate the similarity of the two distributions
# null hypothesis: both samples come from the same distribution; it is rejected when the
# p-value falls below the significance level (0.05 here)
# NOTE(review): both inputs are lists of POS-tag strings, while the KS test is defined for
# ordered data — presumably the tags end up compared by string sort order; confirm that this
# is meaningful, or consider a test designed for categorical counts.
ks_2samp(wsj_tagList, swda_tagList)

The result of the KS test is to reject our null hypothesis. That is, these two samples do not come from the same distribution. This is expected given that I left in all of the extra POS tags that appear in the SWDA corpus and not in the WSJ subset. In addition, we observed the substantial difference in length, which can influence the result of the KS test. Let's see what the result is if I narrow down the distributions to only contain POS tags that appear in both sets.

In [0]:
# put each tag list into a one-column frame so membership in the other corpus can be tested
wsj_df = pd.DataFrame(wsj_tagList, columns=['POS'])
swda_df = pd.DataFrame(swda_tagList, columns=['POS'])

# boolean masks: does this row's tag also occur somewhere in the other corpus?
swda_in_wsj = swda_df['POS'].isin(wsj_df['POS'])
wsj_in_swda = wsj_df['POS'].isin(swda_df['POS'])

# generate new lists that contain only POS tags that occur in both sets
swda_sharedList = swda_df.loc[swda_in_wsj, 'POS'].tolist()
wsj_sharedList = wsj_df.loc[wsj_in_swda, 'POS'].tolist()

swda_dropped = len(swda_df) - len(swda_sharedList)
wsj_dropped = len(wsj_df) - len(wsj_sharedList)
print('The shared SWDA list contains {} values after dropping {} unshared values.'.format(len(swda_sharedList), swda_dropped))
print('The shared WSJ list contains {} values after dropping {} unshared values.'.format(len(wsj_sharedList), wsj_dropped))

In [0]:
# find the POS tags present in both corpora and their relative frequency in each one
# WSJ frequencies, listed in the key order of swda_POSdist
shared_POS_WSJ = [(tag, WSJ_POSdist[tag]) for tag in swda_POSdist if tag in WSJ_POSdist]

# SWDA frequencies, listed in the key order of WSJ_POSdist
shared_POS_SWDA = [(tag, swda_POSdist[tag]) for tag in WSJ_POSdist if tag in swda_POSdist]
print('There are {} POS tags that shared between the two datasets.'.format(len(shared_POS_WSJ)))

In [0]:
# build one frame per corpus from the (tag, relative frequency) pairs, then join them on the
# tag so each row holds a shared POS tag with its WSJ and SWDA relative frequencies
df_wsj = pd.DataFrame(shared_POS_WSJ, columns=['POS', 'WSJ'])
df_swda = pd.DataFrame(shared_POS_SWDA, columns=['POS', 'SWDA'])
df = df_wsj.merge(df_swda, on='POS')
# print first 10 rows, can remove [:10] to view all
df[:10]

In [0]:
# look at the shared tag occurrences as group barplot
# use the tag as the index (modifies df in place); the 'POS' column itself is kept
df.index = df['POS']
df.plot(kind='bar', title='Shared POS tag distribution')

In [0]:
#Setting the width for the bars
width = 0.25 

# Bar plots
# NOTE(review): `fig` and `ax` are not referenced again in this cell; the plt.* calls below
# presumably draw on this newly created figure via pyplot's current-figure state.
fig, ax = plt.subplots(figsize=(20,5))
plt.xticks(rotation=50)
# WSJ relative frequencies, one bar per shared tag
plt.bar(df['POS'], 
        df['WSJ'], 
        width, 
        alpha=0.7, 
        color='#7F00FF', 
        label='WSJ')

# SWDA relative frequencies; same x values and width as the WSJ call above, so the two
# series share positions and the alpha=0.7 transparency lets the lower one show through
plt.bar(df['POS'], 
        df['SWDA'], 
        width, 
        alpha=0.7, 
        color='#FFD700',
        label='SWDA')

plt.legend(loc='upper right')
plt.ylabel('Relative frequency')
plt.title('Relative frequency distribution of shared POS tags')
# the path is relative to the working directory, which an earlier cell changed to swda_master
plt.savefig('../shared_dist.png')

In [0]:
# now let's pit these datasets against each other as before, maintaining the same null hypothesis:
# that these datasets come from the same distribution
# (this time only tags that occur in both corpora are included)
ks_2samp(wsj_sharedList, swda_sharedList)

In [0]:
# now let's try to equate for length by randomly sampling, without replacement, the same
# number of items from the shared SWDA list as there are in the shared WSJ list.
# A dedicated seeded generator keeps the subsample, and therefore the KS result computed
# from it, the same on every run.
SUBSAMPLE_SEED = 1
swda_tagCompare = np.random.RandomState(SUBSAMPLE_SEED).choice(
    swda_sharedList, size=len(wsj_sharedList), replace=False)
len(swda_tagCompare)

In [0]:
# repeat the KS test with the shared WSJ list against the equally sized random SWDA subsample
ks_2samp(wsj_sharedList, swda_tagCompare)

So these two datasets have very different distributions of parts of speech. How do you think a language model trained on one set would perform on the other? Would it generalize well? Why or why not?

Now it's time to focus in on the dataset that we will be using in our language model.

In [0]:
# here we pull the text from all the utterances that contain trees
# d maps utterance text -> number of tree-bearing utterances with exactly that text
d = defaultdict(int)
corpus = CorpusReader('swda')
# iterate the reader created above instead of constructing a second, identical one
for utt in tqdm(corpus.iter_utterances(display_progress=False)):
    if utt.trees:
        d[utt.text] += 1

In [0]:
# then we sort by number of occurrences (most frequent first) and show the top 50
sorted_SWDAtext = list(d.items())
sorted_SWDAtext.sort(key=lambda pair: pair[1], reverse=True)
sorted_SWDAtext[:50]

Lots of different versions of the same words, this will get handled when we work on the model by lemmatizing the input.

In [0]:
# for completeness let's check the least frequent
# (sorted_SWDAtext is in descending count order, so the last ten entries have the lowest counts)
sorted_SWDAtext[-10:]

In [0]:
# Stick the (phrase, count) pairs into a nice dataframe before plotting
df_text = pd.DataFrame(sorted_SWDAtext, columns=['Phrase', 'count'])

In [0]:
#Setting the positions and width for the bars
width = 0.25 
    
# Plotting the bars
fig, ax = plt.subplots(figsize=(20,5))

plt.xticks(rotation=70)
# df_text was built from the count-sorted list, so its first 50 rows are the 50 most
# frequent phrases among utterances that have trees
plt.bar(df_text['Phrase'][:50], 
        df_text['count'][:50],
        width, 
        alpha=0.7, 
        color='#FFD700')
plt.ylabel('Number of occurrences')
plt.xlabel('Phrase')
plt.title('Top 50 phrases in SWDA corpus')
# the path is relative to the working directory, which an earlier cell changed to swda_master
plt.savefig('../phrase_dist.png')

In [0]:
# let's double check the distributions of utterances including those without trees
# here we count the text of every utterance, whether or not it has a tree
d_full = defaultdict(int)
for utterance in tqdm(CorpusReader('swda').iter_utterances(display_progress=False)):
    d_full[utterance.text] += 1

In [0]:
# order the (phrase, count) pairs from most to least frequent and show the top 50
sorted_SWDAtextFull = list(d_full.items())
sorted_SWDAtextFull.sort(key=lambda pair: pair[1], reverse=True)
sorted_SWDAtextFull[:50]

In [0]:
# for every phrase seen in a tree-bearing utterance, look up how often that phrase occurs
# over all utterances (with or without a tree)
shared_SWDAphrase = [(phrase, d_full[phrase]) for phrase in d if phrase in d_full]

df_sharedPhrase = pd.DataFrame(shared_SWDAphrase, columns=['Phrase', 'count'])
# show the 50 phrases with the highest overall counts
df_sharedPhrase.sort_values(['count'], ascending=False)[:50]

In [0]:
#Setting the positions and width for the bars
width = 0.25 
    
# Plotting the bars
fig, ax = plt.subplots(figsize=(20,5))

plt.xticks(rotation=70)
# counts restricted to utterances that have a tree (top 50 rows of the sorted df_text)
plt.bar(df_text['Phrase'][:50], 
        df_text['count'][:50],
        width, 
        alpha=0.7, 
        color='#7F00FF',
        label='has tree')

# counts of those phrases over all utterances; the frame is sorted by count on the fly
# (once for the labels, once for the heights) and the top 50 rows are plotted
plt.bar(df_sharedPhrase.sort_values(['count'], ascending=False)['Phrase'][:50], 
        df_sharedPhrase.sort_values(['count'], ascending=False)['count'][:50],
        width, 
        alpha=0.7, 
        color='#FFD700',
        label='tree + no tree')

plt.ylabel('Number of occurrences')
plt.xlabel('Phrase')
plt.title('Top 50 phrases in SWDA corpus with and without a tree')
plt.legend(loc='upper right')

Now we are going to switch gears and start some modeling. For this modeling problem we are going to predict the dialog speech act based on the utterance. 

Typically you would separate the data exploration and modeling into two different scripts, but I kept them together for the ease of presentation.

In [0]:
# data preprocessings
import re
from collections import OrderedDict
from nltk.stem.snowball import SnowballStemmer

# model functions
from tensorflow import set_random_seed
from tensorflow.contrib.keras.api.keras.preprocessing.text import Tokenizer
from tensorflow.contrib.keras.api.keras.preprocessing.sequence import pad_sequences
from tensorflow.contrib.keras.api.keras.models import Sequential
from tensorflow.contrib.keras.api.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, \
    Embedding, Bidirectional

# tensorboard and tSNE
from time import time
from tensorflow.contrib.keras.api.keras.callbacks import TensorBoard
from sklearn.manifold import TSNE

In [0]:
# start by setting a random seed for reproducible results
# (seeding is currently disabled: both calls below are commented out)
#np.random.seed(1)
#set_random_seed(1)

# asks for current run number in order to organize logs and weight saves
# NOTE: input() waits for a typed answer, so this cell cannot run unattended, and int()
# raises ValueError if the answer is not a whole number
run_number = int(input('What run number is this? '))
# calls the user to input a number of epoch to run through
nbEpochs = int(input('How many epochs of training would you like to run? '))

# In order to build a model to predict the damsl act tags associated with each utterance
# I first need to clean up the text a little bit. I am going to follow the guidelines used
# by Zhao & Kawahara (2018) and remove the , . / ! ? marks that appear in each utterance.
# It may be beneficial to keep them, maybe something to look into at a future date.

# Zhao, T., & Kawahara, T. (2018). A Unified Neural Architecture for Joint Dialog
# Act Segmentation and Recognition in Spoken Dialog System.
# In Proceedings of the 19th Annual SIGdial Meeting on Discourse and Dialogue (pp. 201-208).
#
# I am going to reconstruct each utterance from the pos_lemmas structure as they are already
# relatively clean.
# tokens that are dropped when an utterance is rebuilt (note that '--' is excluded as well)
excl = [',', '.', '!', '?', '/', '--']

print("Proceeding to gather data from the SWDA corpus...")

def phrase_act():
    """Gathers usable phrases from the swda corpus.

    Each utterance is rebuilt from the first element of its ``pos_lemmas`` entries,
    skipping the tokens listed in ``excl``. The space in front of ``'`` and ``n'`` is
    removed, and every word is stemmed.

    Returns:
        OrderedDict: cleaned phrase -> damsl act tag, in first-seen order. The phrase is
        the key, so a phrase that occurs several times keeps only the tag of its last
        occurrence. An utterance that yields no tokens is stored under the empty string.
    """
    d = OrderedDict()
    corpus = CorpusReader('swda')
    # stems the words as english and lowercases everything
    # the idea and code for stemming came from:
    # https://medium.com/@sabber/classifying-yelp-review-comments-using-lstm-and-word-embeddings-part-1-eb2275e4066b
    # built once up front: the stemmer does not depend on the utterance
    stemmer = SnowballStemmer('english')
    for utt in tqdm(corpus.iter_utterances(display_progress=False), ascii=True):
        utterance = [x[0] for x in np.asarray(utt.pos_lemmas(wn_lemmatize=False))
                     if x[0] not in excl]
        # The phrase is cleaned once per utterance, after all tokens are collected, so
        # `text` is always derived from the current utterance. A stale value from the
        # previous iteration would otherwise attach this utterance's tag to the wrong phrase.
        phrase = " ".join(utterance)
        # remove space before ' and n'
        phrase_clean = re.sub(r"\s+(?='|n')", '', phrase)
        # splits the phrase into single words and stems each of them
        stemmed_words = [stemmer.stem(word) for word in phrase_clean.split()]
        text = " ".join(stemmed_words)
        # I picked the damsl act tag as my speech action label because it reduces the number of action tags in the dataset
        # from over 200, to 43. it says 44 in http://compprag.christopherpotts.net/swda.html,
        # but my label count says 43.
        d[text] = utt.damsl_act_tag()
    return d

# build the phrase -> damsl act mapping by running over the whole corpus
phrase_dict = phrase_act()

In [0]:
# hand pandas a list of (phrase, act) tuples rather than the dictionary itself
phrase_list = list(phrase_dict.items())
print("The total dataset is made up of {} samples.".format(len(phrase_list)))

# Here I will be adapting a medium post:
# https://medium.com/@sabber/classifying-yelp-review-comments-using-lstm-and-word-embeddings-part-1-eb2275e4066b
# focused on yelp reviews, to the classification problem at hand.
df = pd.DataFrame(phrase_list, columns=['Phrase', 'Act'])
# remove any rows with missing values (a phrase that is an empty string is not
# a missing value and is kept)
df = df.dropna()
# one hot encode the damsl act tags: one column per distinct act
labels = pd.get_dummies(df['Act']).values
print("There are {} possible acts in this dataset.".format(len(labels[0])))
print("Tokenizing and generating sequences...")

# Create sequence, limiting vocab size to 15000
vocabulary_size = 15000
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(df['Phrase'])

# and cutting phrase length off at 20 words. This will pad sequences below 20 up to 20, putting the
# meaningful text at the end of the sequence.
sequences = tokenizer.texts_to_sequences(df['Phrase'])
data = pad_sequences(sequences, maxlen=20)
print("Done.")


In [0]:
# word counts collected while fitting the tokenizer on df['Phrase'] (displays the whole mapping)
tokenizer.word_counts

In [0]:
# the tokenizer's word -> integer index mapping (displays the whole mapping)
tokenizer.word_index

In [0]:
# number of distinct words known to the tokenizer = number of keys in its word index
count = len(tokenizer.word_index)
print('The total count of different words in the tokenizer is {}.'.format(count))

In [0]:
# first ten cleaned phrases
df['Phrase'][:10]

In [0]:
# the act labels of those same first ten rows
df['Act'][:10]

In [0]:
# the padded integer sequences for the first ten rows (padded/cut with maxlen=20)
data[:10]

In [0]:
# create a model that predicts the speech act of the provided utterance...
# https://keras.io/layers/ contains all the documentation for every layer

# initiate the model
model = Sequential()
# Embedding layer is first in this sort of task, with vocab size, embedding size, and sequence length as parameters
# embeds in a hundred dimensions; input_length=20 matches the maxlen passed to pad_sequences
model.add(Embedding(vocabulary_size, 100, input_length=20, name="Embed"))
# 1D convolution with 64 filters and a kernel spanning 5 consecutive positions
# (author's note: adding the 1D convolutional layer will speed up training)
model.add(Conv1D(64, 5, activation='relu', name="Conv1"))
# grab the max activation within pool size, stride size defaults to pool size which is 4 in this case
model.add(MaxPooling1D(pool_size=4, name="Max1"))
# bidirectional wrapper around a single 100-unit LSTM with tanh activation, 20% dropout and
# 20% recurrent dropout; note that name="BI-LSTM1" is given to the inner LSTM, not the wrapper
# NOTE(review): per the original comment the LSTM defaults are a hard sigmoid recurrent
# activation, glorot uniform kernel initializer and orthogonal recurrent initializer —
# confirm against the installed Keras version
model.add(Bidirectional(LSTM(100, activation='tanh', dropout=0.2, recurrent_dropout=0.2, name="BI-LSTM1")))
# use softmax activation function on the final dense (fully connected) layer
# because it works well with multiclass classification;
# one output unit per one-hot label column (len(labels[0]))
model.add(Dense(len(labels[0]), activation='softmax', name="FC1"))

# compile the network with categorical cross entropy loss function, the adam optimizer,
# and accuracy as the reported metric
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [0]:
# save logs for use with tensorboard
# each run logs to ../logs/run<run_number>/<timestamp>, so repeated runs do not overwrite each other
# NOTE(review): histogram_freq=0 is combined with write_grads=True — presumably gradient
# histograms are only written when histogram_freq > 0; confirm against the Keras docs
tensorboard = TensorBoard(log_dir='../logs/run{}/{}'.format(run_number, time()), histogram_freq=0,
                          write_graph=True, write_grads=True)

In [0]:
# train the model on an 80/20 split for a user decided number of epochs
# the batch size is set explicitly to 16 here (it is not left at the library default)
# NOTE(review): validation_split=0.2 presumably holds out the last 20% of the rows rather
# than a random 20% — confirm, since the rows follow corpus order
model.fit(data, labels,
          batch_size=16,
          validation_split=0.2, epochs=nbEpochs,
          verbose=1, callbacks=[tensorboard])

In [0]:
# When you have a categorical problem that contains a large number of categories
# and your accuracy shoots up to 60-70% after just a couple batches, you should be
# aware that something is amiss...the first thing you should do is check the 
# distribution of categories...
# value_counts() lists how many rows carry each act label, most frequent first
df['Act'].value_counts()

In [0]:
# write the trained model to an .h5 file one level above the working directory; the file
# name records the run number and the number of epochs entered earlier
model.save("../model_run{}_epochs{}.h5".format(run_number, nbEpochs))
print("Saved model to parent folder.")

# the logs were written to ../logs/run<run_number>/ by the TensorBoard callback, so the
# printed `--logdir=logs/run.../` is relative to that parent folder
print('To view tensorboard, open up either Firefox or Chrome and type localhost:6006 in the address bar.')
print('Then run `tensorboard --logdir=logs/run{}/` in your terminal.'.format(run_number))


In [0]:
# finally let's take a look at the word embeddings using tSNE

# first we need to grab the weights from the embedding layer
# (layers[0] is the "Embed" layer, the first one added to the model)
word_embds = model.layers[0].get_weights()[0]

print('Generating a visualization of the word embeddings of your model...')
# then use tSNE to reduce dimensionality from 100 to 2
# the shape of this structure will change as the number of epochs increase
# NOTE(review): no random_state is passed to TSNE, so the layout presumably differs from
# run to run even for identical weights — confirm
tsne_embds = TSNE(n_components=2).fit_transform(word_embds)

# then plot one point per embedding row
plt.scatter(tsne_embds[:,0], tsne_embds[:,1])
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.title('tSNE reduction of word embeddings to 2 dimensions')

# the path is relative to the working directory, which an earlier cell changed to swda_master
plt.savefig('../tSNE_wordembeds.png')