### Word Embedding Models:

In this script we build the utterance-classification models tested in our analysis. Specifically, you will find here the implementations of the following models:

1. MLP (2 Hidden Layers) + Pre-Trained Static Embeddings
2. MLP (2 Hidden Layers) + Pre-Trained Trainable Embeddings
3. MLP + Pre-Trained Static Embeddings + Simple Attention Mechanism
4. MLP + Pre-Trained Trainable Embeddings + Simple Attention Mechanism
5. MLP + Pre-Trained Static Embeddings + Differentiated Attention Mechanism

Additionally, you can find the models:

6. MLP (1 Hidden Layer + Dropout) + Pre-Trained Static Embeddings
7. MLP (1 Hidden Layer + Dropout) + Pre-Trained Trainable Embeddings

Models 3 to 7 are compared in the article "Automatic Content Analysis of Computer-Supported Collaborative Inquiry-Based Learning Using Deep Networks and Attention Mechanisms" (Pablo Uribe, Abelino Jiménez, Roberto Araya, Joni Lämsä, Raija Hämäläinen, Jouni Viiri), while models 1 to 5 are compared in the article "Deep Networks for Collaboration Analytics: Automatic Coding of Face-to-Face Conversations in a Computer-Supported Inquiry-Based Learning Context" (Joni Lämsä, Pablo Uribe, Abelino Jiménez, Raija Hämäläinen, Daniela Caballero, Roberto Araya). 

In [20]:
#Importing Libraries
import numpy as np
import os
import pandas as pd
from numpy import array
import keras
import seaborn as sns
import matplotlib.pyplot as plt
import re
import num2words
from tensorflow.keras import backend
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, Lambda, Input, concatenate,BatchNormalization, Activation, Multiply
from keras.layers.embeddings import Embedding
from keras.layers import LSTM,Conv1D,MaxPooling1D, Bidirectional, GRU, RepeatVector, TimeDistributed, SimpleRNN
from keras.models import Model
from keras import backend as K
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from itertools import combinations


%matplotlib inline

In [6]:
#Set local path of your computer where data files are saved.
#The CIBL_LOCAL_PATH environment variable, when set, overrides the default below
#so the notebook can run on another machine without editing this cell.
local_path = os.environ.get('CIBL_LOCAL_PATH', r'/Users/pablouribepizarro/Desktop/CIAE/CIBL')

In [11]:
#Collect the path of every Excel transcript found under <local_path>/Raw/inquiry_lessons
files_path = []
for root, _dirs, filenames in os.walk(local_path+'/Raw/inquiry_lessons'):
    for filename in filenames:
        #Test the real extension (a substring test would also accept e.g. 'x.xlsx.bak')
        #and skip the '~$name.xlsx' lock files Excel leaves next to open workbooks
        if filename.endswith('.xlsx') and not filename.startswith('~$'):
            files_path.append(os.path.join(root, filename))

#os.walk yields entries in an unspecified order; sort so the order of the
#documents (and everything derived from it) is the same on every machine
files_path.sort()

n_files = len(files_path)
print('Total Files: {}'.format(n_files))

Total Files: 11


In [12]:
#Build a list of data frames containing each group's transcription:

#Compiled once instead of on every row
_DIGIT_RUN = re.compile(r"(\d+)")


def _spell_out_numbers(text):
    """Replace every run of digits in *text* by its spelled-out form."""
    #NOTE(review): num2words is called with its default language; the corpus and
    #the embeddings look Finnish, so lang='fi' may have been intended - confirm.
    return _DIGIT_RUN.sub(lambda m: num2words.num2words(int(m.group(0))), text)


dfs = []
for path in files_path:
    df = pd.read_excel(path)
    #Keep only the rows whose Phase is a finite number. The explicit copy makes
    #the column assignments below act on an independent frame rather than on a
    #slice of the original one (avoids pandas' SettingWithCopyWarning).
    df = df[np.isfinite(df['Phase'])].copy()
    #Cast the phase code to int and shift it down by one
    df['Phase'] = df['Phase'].astype(int)-1
    #Transform digits into words:
    df['Utterance'] = df['Utterance'].apply(_spell_out_numbers)
    #Consider '?' as a new word (detach it from the preceding token):
    df['Utterance'] = df['Utterance'].apply(lambda utterance: utterance.replace('?',' ?'))
    #Add the number of whitespace-separated words:
    df['Number of Words'] = df['Utterance'].apply(lambda utterance: len(utterance.split()))
    dfs.append(df)
#Number of retained utterances per document
docs_size = pd.Series([df.shape[0] for df in dfs])
dfs[0].head()

Unnamed: 0,StudentID,Student,time_start,time_end,Phase,Phase_start,Phase_end,Utterance,Number of Words
0,O2,,,,0,1.0,,Satunnaiskävely.,1
1,O1,,,,0,,,[lukee tehtävänantoa],2
2,O1,,,,0,,1.0,Voi voi. [käynnistää videon],4
3,O1,,,,0,1.0,,"Missä se vaeltaa siellä ? Onks se, ei. Ei, se ...",17
4,O2,,,,0,,,[epäselvä],1


In [14]:
#Join all documents (one string per group transcription):
docs = [' '.join(df['Utterance'].astype(str)) for df in dfs]
#Set a tokenizer. The character '?' is considered as a word of its own, so it
#must NOT appear in `filters` (it previously did, which silently removed every
#'?' token). num_words=2000 restricts the generated sequences to the most
#frequent words; word_index itself still lists every word seen.
t = Tokenizer(filters='¡!"\'#$%&()*+,./:;<=>@[\\]^_`{|}~\t\n',num_words=2000)
#Fit the tokenizer:
t.fit_on_texts(docs)

In [15]:
#Load the pre-trained word vectors (binary word2vec format) from the Raw folder
from gensim.models import KeyedVectors

path = '{}{}'.format(local_path, '/Raw/finnish_4B_parsebank_skgram.bin')
#Only the first 500000 vectors of the file are read; undecodable bytes in the
#vocabulary are replaced instead of raising
wv = KeyedVectors.load_word2vec_format(
    path,
    binary=True,
    encoding='UTF-8',
    unicode_errors='replace',
    limit=500000,
)

In [16]:
#Build the Embedding Matrix: row i holds the pre-trained vector of the word the
#tokenizer maps to index i; rows without a pre-trained vector stay at zero.
max_vocab = 10000
embedding_matrix = np.zeros((max_vocab, 200))
for word, i in t.word_index.items():
    #word_index lists every word seen by the tokenizer, so indices can exceed
    #the matrix size; those words have no row and are skipped explicitly
    if i >= max_vocab:
        continue
    try:
        embedding_matrix[i] = wv.get_vector(word)
    except KeyError:
        #Word absent from the pre-trained vocabulary: keep the zero row
        pass

#### 1. MLP (2 hidden Layers) + Pre Trained Static Embeddings:

In [18]:
#Model 1 ('pe'): frozen pre-trained embeddings, summed over all words of the
#previous/current/next utterances, followed by two hidden layers.

#Set vocab size of the embedding matrix (must equal the number of rows of embedding_matrix)
vocab_size = 10000
#Set utterance length
max_length = 20

#Utterance inputs
utterance_input = Input(shape=(max_length,), dtype='int32', name='utterance_input')
#Previous
previous_utterance_input = Input(shape=(max_length,), dtype='int32', name='p_utterance_input')
#Next
next_utterance_input = Input(shape=(max_length,), dtype='int32', name='n_utterance_input')

#Relative Position input
rel_position_input = Input(shape=(1,), dtype='float32', name='rel_position_input')
#Number of words input
n_words_input = Input(shape=(1,), dtype='float32', name='n_words_input')

#Embedding Layer: Trainable = False (shared by the three utterance inputs)
emb = Embedding(output_dim=200, input_dim=vocab_size, input_length=max_length, weights=[embedding_matrix],trainable=False)

embedded_previous = emb(previous_utterance_input)
embedded_utterance = emb(utterance_input)
embedded_next = emb(next_utterance_input)

#Concatenate Previous, Current and Next Utterance Embeddings along the word axis
x = concatenate([embedded_previous,embedded_utterance,embedded_next],axis = 1)

#Sum the word vectors over the word axis: one 200-d vector per sample
x = Lambda(lambda x: K.sum(x, axis=1))(x)

#Add the Relative Position and Number of Words Inputs
x = concatenate([x,rel_position_input,n_words_input])

#Add Hidden Layers
x = Dense(64, activation='relu')(x)
x = Dense(64, activation='relu')(x)

main_output = Dense(5, activation='softmax', name='main_output')(x)

#The name is given at construction time: assigning model.name afterwards
#raises AttributeError on Keras versions where `name` is a read-only property
model_1 = Model(inputs=[previous_utterance_input,utterance_input,next_utterance_input,
                             rel_position_input,n_words_input], outputs=[main_output],
                name='pe')
model_1.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

#Make sure the output folders exist before writing the h5 files
for out_dir in (local_path+'/model_weights/', local_path+'/models/'):
    os.makedirs(out_dir, exist_ok=True)

#Save the initial (untrained) weights of the model in h5 format
model_1.save_weights(local_path+'/model_weights/'+str(model_1.name)+'.h5')
#Save the model in h5 format
model_1.save(local_path+'/models/'+str(model_1.name)+'.h5')

#### 2. MLP (2 hidden Layers) + Pre Trained Trainable Embeddings:

In [19]:
#Model 2 ('pte'): same architecture as the static-embedding 2-hidden-layer MLP,
#but the pre-trained embedding matrix is fine-tuned during training.

#Set vocab size of the embedding matrix (must equal the number of rows of embedding_matrix)
vocab_size = 10000
#Set utterance length
max_length = 20

#Utterance inputs
utterance_input = Input(shape=(max_length,), dtype='int32', name='utterance_input')
#Previous
previous_utterance_input = Input(shape=(max_length,), dtype='int32', name='p_utterance_input')
#Next
next_utterance_input = Input(shape=(max_length,), dtype='int32', name='n_utterance_input')

#Relative Position input
rel_position_input = Input(shape=(1,), dtype='float32', name='rel_position_input')
#Number of words input
n_words_input = Input(shape=(1,), dtype='float32', name='n_words_input')

#Embedding Layer: Trainable = True (shared by the three utterance inputs)
emb = Embedding(output_dim=200, input_dim=vocab_size, input_length=max_length, weights=[embedding_matrix],trainable=True)

embedded_previous = emb(previous_utterance_input)
embedded_utterance = emb(utterance_input)
embedded_next = emb(next_utterance_input)

#Concatenate Previous, Current and Next Utterance Embeddings along the word axis
x = concatenate([embedded_previous,embedded_utterance,embedded_next],axis = 1)

#Sum the word vectors over the word axis: one 200-d vector per sample
x = Lambda(lambda x: K.sum(x, axis=1))(x)

#Add the Relative Position and Number of Words Inputs
x = concatenate([x,rel_position_input,n_words_input])

#Add Hidden Layers
x = Dense(64, activation='relu')(x)
x = Dense(64, activation='relu')(x)

main_output = Dense(5, activation='softmax', name='main_output')(x)

#The name is given at construction time: assigning model.name afterwards
#raises AttributeError on Keras versions where `name` is a read-only property
model_2 = Model(inputs=[previous_utterance_input,utterance_input,next_utterance_input,
                             rel_position_input,n_words_input], outputs=[main_output],
                name='pte')
model_2.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

#Make sure the output folders exist before writing the h5 files
for out_dir in (local_path+'/model_weights/', local_path+'/models/'):
    os.makedirs(out_dir, exist_ok=True)

#Save the initial (untrained) weights of the model in h5 format
model_2.save_weights(local_path+'/model_weights/'+str(model_2.name)+'.h5')
#Save the model in h5 format
model_2.save(local_path+'/models/'+str(model_2.name)+'.h5')

#### 3. MLP + Pre Trained Static Embeddings + Simple Attention:

In [21]:
#Model 3 ('pe_simple_at'): frozen pre-trained embeddings, a single attention
#distribution over all words of the three utterances, one hidden layer.

#Vocab size of the embedding matrix (must equal the number of rows of embedding_matrix)
vocab_size = 10000
#Utterance length
max_length = 20

from keras.activations import softmax

def softMaxAxis1(x):
    """Softmax along axis 1 (the word axis), so the weights of one sample sum to 1 over its words."""
    return softmax(x,axis=1)

#Utterance inputs
utterance_input = Input(shape=(max_length,), dtype='int32', name='utterance_input')
#Previous
previous_utterance_input = Input(shape=(max_length,), dtype='int32', name='p_utterance_input')
#Next
next_utterance_input = Input(shape=(max_length,), dtype='int32', name='n_utterance_input')

#Relative Position input
rel_position_input = Input(shape=(1,), dtype='float32', name='rel_position_input')
#Number of words input
n_words_input = Input(shape=(1,), dtype='float32', name='n_words_input')

#Embedding Layer: Trainable = False (shared by the three utterance inputs)
emb = Embedding(output_dim=200, input_dim=vocab_size, input_length=max_length, weights=[embedding_matrix],trainable=False)

embedded_previous = emb(previous_utterance_input)
embedded_utterance = emb(utterance_input)
embedded_next = emb(next_utterance_input)

#Concatenate Previous, Current and Next Utterance Embeddings along the word axis
x = concatenate([embedded_previous,embedded_utterance,embedded_next],axis = 1)

#Define the Attention Mechanism: one scalar score per word, normalised over the words
attention = TimeDistributed(Dense(1))
attention_weights = attention(x)
attention_probs = Activation(softMaxAxis1)(attention_weights)

#Multiply Attention Probabilities with the respective embeddings (broadcast over the 200 dims):
weighted_encoddings = Lambda(lambda x: x[0] * x[1])([x, attention_probs])

#Sum Layer: attention-weighted sum of the word vectors
decoder = Lambda(lambda x: K.sum(x, axis=1))
x = decoder(weighted_encoddings)

#Add the Relative Position and Number of Words Inputs
x = concatenate([x,rel_position_input,n_words_input])

#Add a hidden Layer
x = Dense(64, activation='relu')(x)

main_output = Dense(5, activation='softmax', name='main_output')(x)


#The name is given at construction time: assigning model.name afterwards
#raises AttributeError on Keras versions where `name` is a read-only property
model_3 = Model(inputs=[previous_utterance_input,utterance_input,next_utterance_input,
                             rel_position_input,n_words_input], outputs=[main_output],
                name='pe_simple_at')
model_3.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

#Make sure the output folders exist before writing the h5 files
for out_dir in (local_path+'/model_weights/', local_path+'/models/'):
    os.makedirs(out_dir, exist_ok=True)

#Save the initial (untrained) weights of the model in h5 format
model_3.save_weights(local_path+'/model_weights/'+str(model_3.name)+'.h5')
#Save the model in h5 format
model_3.save(local_path+'/models/'+str(model_3.name)+'.h5')

#### 4. MLP + Pre Trained Trainable Embeddings + Simple Attention:

In [None]:
#Model 4 ('pte_simple_at'): trainable pre-trained embeddings, a single attention
#distribution over all words of the three utterances, one hidden layer.

#Vocab size of the embedding matrix (must equal the number of rows of embedding_matrix)
vocab_size = 10000
#Utterance length
max_length = 20

from keras.activations import softmax

def softMaxAxis1(x):
    """Softmax along axis 1 (the word axis), so the weights of one sample sum to 1 over its words."""
    return softmax(x,axis=1)

#Utterance inputs
utterance_input = Input(shape=(max_length,), dtype='int32', name='utterance_input')
#Previous
previous_utterance_input = Input(shape=(max_length,), dtype='int32', name='p_utterance_input')
#Next
next_utterance_input = Input(shape=(max_length,), dtype='int32', name='n_utterance_input')

#Relative Position input
rel_position_input = Input(shape=(1,), dtype='float32', name='rel_position_input')
#Number of words input
n_words_input = Input(shape=(1,), dtype='float32', name='n_words_input')

#Embedding Layer: Trainable = True (shared by the three utterance inputs)
emb = Embedding(output_dim=200, input_dim=vocab_size, input_length=max_length, weights=[embedding_matrix],trainable=True)

embedded_previous = emb(previous_utterance_input)
embedded_utterance = emb(utterance_input)
embedded_next = emb(next_utterance_input)

#Concatenate Previous, Current and Next Utterance Embeddings along the word axis
x = concatenate([embedded_previous,embedded_utterance,embedded_next],axis = 1)

#Define the Attention Mechanism: one scalar score per word, normalised over the words
attention = TimeDistributed(Dense(1))
attention_weights = attention(x)
attention_probs = Activation(softMaxAxis1)(attention_weights)

#Multiply Attention Probabilities with the respective embeddings (broadcast over the 200 dims):
weighted_encoddings = Lambda(lambda x: x[0] * x[1])([x, attention_probs])

#Sum Layer: attention-weighted sum of the word vectors
decoder = Lambda(lambda x: K.sum(x, axis=1))
x = decoder(weighted_encoddings)

#Add the Relative Position and Number of Words Inputs
x = concatenate([x,rel_position_input,n_words_input])

#Add a hidden Layer
x = Dense(64, activation='relu')(x)

main_output = Dense(5, activation='softmax', name='main_output')(x)


#This model is bound to model_4: the cell used to assign it to model_3, which
#overwrote the static-embedding attention model and left model_4 undefined.
#The name is given at construction time: assigning model.name afterwards
#raises AttributeError on Keras versions where `name` is a read-only property
model_4 = Model(inputs=[previous_utterance_input,utterance_input,next_utterance_input,
                             rel_position_input,n_words_input], outputs=[main_output],
                name='pte_simple_at')
model_4.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

#Make sure the output folders exist before writing the h5 files
for out_dir in (local_path+'/model_weights/', local_path+'/models/'):
    os.makedirs(out_dir, exist_ok=True)

#Save the initial (untrained) weights of the model in h5 format
model_4.save_weights(local_path+'/model_weights/'+str(model_4.name)+'.h5')
#Save the model in h5 format
model_4.save(local_path+'/models/'+str(model_4.name)+'.h5')

#### 5. MLP + Pre Trained Static Embeddings + Differentiated Attention:

In [23]:
#Model 5 ('pe_dif_at'): frozen pre-trained embeddings with one attention head
#per output class. Each head feeds its own hidden layer and produces a single
#score; the five scores are concatenated and normalised with a softmax.

#Vocab size of the embedding matrix (must equal the number of rows of embedding_matrix)
vocab_size = 10000
#Utterance length
max_length = 20
#Number of output classes = number of attention heads
n_classes = 5

from keras.activations import softmax

def softMaxAxis1(x):
    """Softmax along axis 1 (the word axis), so the weights of one sample sum to 1 over its words."""
    return softmax(x,axis=1)

#Utterance inputs
utterance_input = Input(shape=(max_length,), dtype='int32', name='utterance_input')
#Previous
previous_utterance_input = Input(shape=(max_length,), dtype='int32', name='p_utterance_input')
#Next
next_utterance_input = Input(shape=(max_length,), dtype='int32', name='n_utterance_input')

#Relative Position input
rel_position_input = Input(shape=(1,), dtype='float32', name='rel_position_input')
#Number of words input
n_words_input = Input(shape=(1,), dtype='float32', name='n_words_input')

#Embedding Layer: Trainable = False (shared by the three utterance inputs)
emb = Embedding(output_dim=200, input_dim=vocab_size, input_length=max_length, weights=[embedding_matrix],trainable=False)

embedded_previous = emb(previous_utterance_input)
embedded_utterance = emb(utterance_input)
embedded_next = emb(next_utterance_input)

#Concatenate Previous, Current and Next Utterance Embeddings along the word axis
utterances = concatenate([embedded_previous,embedded_utterance,embedded_next],axis = 1)

#Differentiated Attention Mechanism: every head has its own scoring layer.
#The activation layers are named 'attention_1' ... 'attention_5'.
#Layers are created in the same order as in the unrolled version so that the
#automatically generated layer names do not change.
weighted_encoddings = []
for head in range(1, n_classes + 1):
    attention = TimeDistributed(Dense(1))
    attention_parameters = attention(utterances)
    attention_weigths = Activation(softMaxAxis1, name = 'attention_{}'.format(head))(attention_parameters)
    #Weight every word vector by its attention probability (broadcast over the 200 dims)
    weighted_encoddings.append(Lambda(lambda x: x[0] * x[1])([utterances, attention_weigths]))

#Sum Layer (shared by all heads): attention-weighted sum of the word vectors
decoder = Lambda(lambda x: K.sum(x, axis=1))

#Output: one hidden layer and one un-normalised score per head,
#the score layers are named 'output_1' ... 'output_5'
class_scores = []
for head, weighted in enumerate(weighted_encoddings, start=1):
    x_head = decoder(weighted)
    x_head = concatenate([x_head,rel_position_input,n_words_input])
    x_head = Dense(64, activation='relu')(x_head)
    class_scores.append(Dense(1, name='output_{}'.format(head))(x_head))

output = concatenate(class_scores)
main_output = Activation('softmax', name= 'main_output')(output)

#The name is given at construction time: assigning model.name afterwards
#raises AttributeError on Keras versions where `name` is a read-only property
model_5 = Model(inputs=[previous_utterance_input,utterance_input,next_utterance_input,
                             rel_position_input,n_words_input], outputs=[main_output],
                name='pe_dif_at')

model_5.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

#Make sure the output folders exist before writing the h5 files
for out_dir in (local_path+'/model_weights/', local_path+'/models/'):
    os.makedirs(out_dir, exist_ok=True)

#Save the initial (untrained) weights of the model in h5 format
model_5.save_weights(local_path+'/model_weights/'+str(model_5.name)+'.h5')
#Save the model in h5 format
model_5.save(local_path+'/models/'+str(model_5.name)+'.h5')

#### 6. MLP (1 Hidden Layer + Dropout) + Pre Trained Static Embeddings:

In [28]:
#Model 6 ('pe_1l_do'): frozen pre-trained embeddings, summed over all words of
#the previous/current/next utterances, followed by one hidden layer with dropout.

#Set vocab size of the embedding matrix (must equal the number of rows of embedding_matrix)
vocab_size = 10000
#Set utterance length
max_length = 20

#Utterance inputs
utterance_input = Input(shape=(max_length,), dtype='int32', name='utterance_input')
#Previous
previous_utterance_input = Input(shape=(max_length,), dtype='int32', name='p_utterance_input')
#Next
next_utterance_input = Input(shape=(max_length,), dtype='int32', name='n_utterance_input')

#Relative Position input
rel_position_input = Input(shape=(1,), dtype='float32', name='rel_position_input')
#Number of words input
n_words_input = Input(shape=(1,), dtype='float32', name='n_words_input')

#Embedding Layer: Trainable = False (shared by the three utterance inputs)
emb = Embedding(output_dim=200, input_dim=vocab_size, input_length=max_length, weights=[embedding_matrix],trainable=False)

embedded_previous = emb(previous_utterance_input)
embedded_utterance = emb(utterance_input)
embedded_next = emb(next_utterance_input)

#Concatenate Previous, Current and Next Utterance Embeddings along the word axis
x = concatenate([embedded_previous,embedded_utterance,embedded_next],axis = 1)

#Sum the word vectors over the word axis: one 200-d vector per sample
x = Lambda(lambda x: K.sum(x, axis=1))(x)

#Add the Relative Position and Number of Words Inputs
x = concatenate([x,rel_position_input,n_words_input])

#Add One Hidden Layer followed by dropout (rate 0.2)
x = Dense(64, activation='relu')(x)
x = Dropout(0.2)(x)

main_output = Dense(5, activation='softmax', name='main_output')(x)

#The name is given at construction time: assigning model.name afterwards
#raises AttributeError on Keras versions where `name` is a read-only property
model_6 = Model(inputs=[previous_utterance_input,utterance_input,next_utterance_input,
                             rel_position_input,n_words_input], outputs=[main_output],
                name='pe_1l_do')
model_6.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

#Make sure the output folders exist before writing the h5 files
for out_dir in (local_path+'/model_weights/', local_path+'/models/'):
    os.makedirs(out_dir, exist_ok=True)

#Save the initial (untrained) weights of the model in h5 format
model_6.save_weights(local_path+'/model_weights/'+str(model_6.name)+'.h5')
#Save the model in h5 format
model_6.save(local_path+'/models/'+str(model_6.name)+'.h5')

#### 7. MLP (1 Hidden Layer + Dropout) + Pre Trained Trainable Embeddings:

In [30]:
#Model 7 ('pte_1l_do'): trainable pre-trained embeddings, summed over all words
#of the previous/current/next utterances, followed by one hidden layer with dropout.

#Set vocab size of the embedding matrix (must equal the number of rows of embedding_matrix)
vocab_size = 10000
#Set utterance length
max_length = 20

#Utterance inputs
utterance_input = Input(shape=(max_length,), dtype='int32', name='utterance_input')
#Previous
previous_utterance_input = Input(shape=(max_length,), dtype='int32', name='p_utterance_input')
#Next
next_utterance_input = Input(shape=(max_length,), dtype='int32', name='n_utterance_input')

#Relative Position input
rel_position_input = Input(shape=(1,), dtype='float32', name='rel_position_input')
#Number of words input
n_words_input = Input(shape=(1,), dtype='float32', name='n_words_input')

#Embedding Layer: Trainable = True (shared by the three utterance inputs)
emb = Embedding(output_dim=200, input_dim=vocab_size, input_length=max_length, weights=[embedding_matrix],trainable=True)

embedded_previous = emb(previous_utterance_input)
embedded_utterance = emb(utterance_input)
embedded_next = emb(next_utterance_input)

#Concatenate Previous, Current and Next Utterance Embeddings along the word axis
x = concatenate([embedded_previous,embedded_utterance,embedded_next],axis = 1)

#Sum the word vectors over the word axis: one 200-d vector per sample
x = Lambda(lambda x: K.sum(x, axis=1))(x)

#Add the Relative Position and Number of Words Inputs
x = concatenate([x,rel_position_input,n_words_input])

#Add One Hidden Layer followed by dropout (rate 0.2)
x = Dense(64, activation='relu')(x)
x = Dropout(0.2)(x)

main_output = Dense(5, activation='softmax', name='main_output')(x)

#The name is given at construction time: assigning model.name afterwards
#raises AttributeError on Keras versions where `name` is a read-only property
model_7 = Model(inputs=[previous_utterance_input,utterance_input,next_utterance_input,
                             rel_position_input,n_words_input], outputs=[main_output],
                name='pte_1l_do')
model_7.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

#Make sure the output folders exist before writing the h5 files
for out_dir in (local_path+'/model_weights/', local_path+'/models/'):
    os.makedirs(out_dir, exist_ok=True)

#Save the initial (untrained) weights of the model in h5 format
model_7.save_weights(local_path+'/model_weights/'+str(model_7.name)+'.h5')
#Save the model in h5 format
model_7.save(local_path+'/models/'+str(model_7.name)+'.h5')