## Language Modeling Using CNNs and Gated Linear Units

Based on the paper: <i>Language Modeling with Gated Convolutional Networks</i>. The sequential nature of the text is modeled by causal convolutions - which use left-zero-padding.

In [1]:
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy,CategoricalCrossentropy
import numpy as np
import nltk
from gensim.models import Word2Vec
from collections import Counter
from sklearn.model_selection import train_test_split

tf.keras.backend.set_floatx('float64')
import warnings
warnings.filterwarnings('ignore')

### Data Preprocessing

In [2]:
# Load Moby Dick from the NLTK Gutenberg corpus in two views:
#   words - the book as one flat sequence of tokens
#   sents - the book as a sequence of tokenized sentences (used to train word2vec)
# NOTE(review): presumably needs the corpus to be available locally
# (e.g. nltk.download('gutenberg')) - confirm on a fresh environment.
from nltk.corpus import gutenberg
words = gutenberg.words('melville-moby_dick.txt')
sents = gutenberg.sents('melville-moby_dick.txt')

In [32]:
# training word embeddings
# 128-dimensional vectors are learned from the sentence view of the corpus;
# min_count=1 keeps even tokens that occur only once.
# NOTE(review): `size=` looks like the pre-4.0 gensim keyword (newer releases
# spell it `vector_size=`) - confirm which gensim version this notebook pins.
w2v_model = Word2Vec(sents,size=128,window=5,min_count=1,workers=4)

In [4]:
# Token frequencies over the whole book; the vocabulary keeps only tokens
# that occur more than 3 times.
counted = Counter(words)
common_words = [token for token, freq in counted.items() if freq > 3]
# (corpus length, distinct tokens, tokens above the frequency cut-off)
len(words), len(set(words)), len(common_words)

(260819, 19317, 5401)

In [5]:
# removing words which don't have embeddings
# Membership is tested on the trained KeyedVectors (`.wv`); the `in` operator
# on the Word2Vec model object itself is a deprecated shortcut in gensim.
common_words_final = [word for word in common_words if word in w2v_model.wv]
print(len(common_words_final))

5398


In [6]:
# Two-way vocabulary lookup, numbered in the order of common_words_final:
#   encoder: word  -> integer index (position in the one-hot vector)
#   decoder: index -> word
encoder = {word: idx for idx, word in enumerate(common_words_final)}
decoder = dict(enumerate(common_words_final))

In [7]:
# Training stream: the corpus tokens in their original order, with every
# out-of-vocabulary (i.e. infrequent) token dropped.
train_words = [token for token in words if token in encoder]
print(len(train_words))

240233


In [8]:
def encode_word_y(word):
    """ encodes word using encoder to one-hot encoding

    The vector length is taken from the size of the module-level `encoder`
    vocabulary rather than a hard-coded constant, so it stays correct when
    the corpus or the frequency cut-off changes.

    args:
        word: token to encode; must be a key of `encoder`
    returns:
        1-D numpy array of zeros with a single 1 at index encoder[word]
    raises:
        KeyError: if `word` is not in `encoder`
    """
    word_emb = np.zeros(len(encoder))
    word_emb[encoder[word]] = 1
    return word_emb

In [9]:
# Build (input, target) windows of 10 tokens each:
#   x[k] - embeddings of tokens i .. i+9
#   y[k] - one-hot encodings of tokens i+1 .. i+10 (the same window shifted by
#          one, i.e. the next word for every position)
x = []
y = []
# The target slice reaches index i+10, so the last usable start is len-11.
# Stopping the range at len-10 guarantees every target window has 10 tokens;
# a shorter final window would make the stacked array ragged.
for i in range(0,len(train_words)-10,9): # some overlap
    temp_x = [w2v_model.wv[word] for word in train_words[i:i+10]]
    temp_y = [encode_word_y(word) for word in train_words[i+1:i+10+1]]
    x.append(np.array(temp_x))
    y.append(np.array(temp_y))

In [10]:
# Stack the per-window lists into dense arrays and display their shapes.
x, y = np.array(x), np.array(y)
x.shape, y.shape

((26692, 10, 128), (26692, 10, 5398))

### Language Model

In [11]:
def linear_convolution_step(x,down,up):
    """ Bottleneck stack of three weight-normalised 1-D convolutions without activation
    args:
        x: input tensor fed to the first convolution
        down: number of filters of the first two (bottleneck) convolutions
        up: number of filters of the last convolution
    returns:
        output tensor of the last convolution
    """
    # (filters, kernel_size) per stage: 1-wide squeeze -> 5-wide conv -> 1-wide expand
    stages = ((down,1),(down,5),(up,1))
    out = x
    for n_filters,width in stages:
        conv = tf.keras.layers.Conv1D(filters=n_filters,kernel_size=width,activation=None)
        out = tfa.layers.WeightNormalization(conv)(out)
    return out

In [12]:
def residual_block(x,down,up):
    """ Gated residual block: a linear bottleneck path multiplied by a sigmoid gate, plus the input
    args:
        x: block input; it is added to the gated output unchanged, so it must already
           have the output's shape (no 1-wide projection is applied to it)
        down: number of filters for downsampling channels
        up: number of filters for upsampling channels
    returns:
        linear_path * sigmoid(gate_path) + x
    """
    # four zeros on the left, none on the right, so a position never sees later time steps
    padded = tf.keras.layers.ZeroPadding1D(padding=(4,0))(x)
    linear_path = linear_convolution_step(padded,down,up)
    gate_path = tf.math.sigmoid(linear_convolution_step(padded,down,up))
    gated = tf.multiply(linear_path,gate_path)
    return tf.math.add(gated,x)

In [17]:
def gated_conv_net(seq_len=10,emb_dim=128,vocab_size=5398):
    """ Network, similar to what was introduced in paper (GCNN-8B), but not as deep/smaller overall
        The output has as many time steps as the input, & each output step holds the unnormalised
        scores (logits) for the next word given the context up to that step
    args:
        seq_len: number of tokens per input window (default 10, as used to build x/y)
        emb_dim: size of each input word embedding (default 128)
        vocab_size: number of output classes per time step (default 5398)
    returns:
        tf.keras Model mapping (batch,seq_len,emb_dim) inputs to (batch,seq_len,vocab_size) logits
    """
    x = tf.keras.layers.Input(shape=(seq_len,emb_dim))
    
    # first layer
    pad1 = tf.keras.layers.ZeroPadding1D(padding=(4,0))(x) # left padding only, keeps the convolution causal
    inter1 = tfa.layers.WeightNormalization(tf.keras.layers.Conv1D(filters=256,kernel_size=5,activation=None))(pad1) # linear convolution
    gate1 = tfa.layers.WeightNormalization(tf.keras.layers.Conv1D(filters=256,kernel_size=5,activation=None))(pad1)
    gate1 = tf.math.sigmoid(gate1) # applying sigmoid activation to create gate
    in1 = tf.keras.layers.Conv1D(filters=256,kernel_size=1)(x) # residual is increased in size before added to out
    out1 = tf.math.add(tf.multiply(inter1,gate1),in1) # adding input to output for residual block
    
    # second layer (bottleneck of 64 channels)
    out2 = residual_block(out1,down=64,up=256)
    out3 = residual_block(out2,down=64,up=256)
    
    # third layer (bottleneck of 128 channels)
    out4 = residual_block(out3,down=128,up=256)
    out5 = residual_block(out4,down=128,up=256)
    
    # per-position projection onto the vocabulary; no activation, so the model emits logits
    out = tf.keras.layers.Conv1D(filters=vocab_size,kernel_size=1,activation=None)(out5)
    model = tf.keras.models.Model(inputs=x,outputs=out)
    
    return model

In [18]:
# Build the gated convolutional language model
model = gated_conv_net()

In [19]:
# Print the layer-by-layer overview (output shapes, parameter counts, connections)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 10, 128)]    0                                            
__________________________________________________________________________________________________
zero_padding1d_2 (ZeroPadding1D (None, 14, 128)      0           input_3[0][0]                    
__________________________________________________________________________________________________
weight_normalization_5 (WeightN (None, 10, 256)      328449      zero_padding1d_2[0][0]           
__________________________________________________________________________________________________
weight_normalization_4 (WeightN (None, 10, 256)      328449      zero_padding1d_2[0][0]           
______________________________________________________________________________________________

In [20]:
def cost_function(labels,logits): # reduce mean over batches
    """ Softmax cross-entropy, summed over the last remaining axis and then averaged
    args:
        labels: target distributions (one-hot in this notebook), class axis last
        logits: unnormalised scores with the same shape as labels
    returns:
        scalar loss; for the (batch,time,vocab) tensors used in this notebook this is the
        per-sequence sum over time steps, averaged over the batch
    """
    per_position = tf.nn.softmax_cross_entropy_with_logits(labels=labels,logits=logits)
    per_sequence = tf.reduce_sum(per_position,axis=-1)
    return tf.math.reduce_mean(per_sequence)

In [21]:
# Adam with a small step size; `learning_rate` is the supported keyword
# (`lr` is a deprecated alias).
optimizer=Adam(learning_rate=0.0001)

In [22]:
# shuffling the data
# One random permutation is applied to both arrays so inputs and targets stay aligned.
# (train_test_split with test_size=0.0 is rejected with a ValueError by current
# scikit-learn releases, so it cannot be used as a pure shuffle.)
shuffle_idx = np.random.permutation(len(x))
x_train,y_train = x[shuffle_idx],y[shuffle_idx]

In [24]:
batch_size = 100
for _ in range(1): # training the model (a single pass over the data)
    # Step through all of x_train; the final batch may be shorter than batch_size.
    # (Stopping at len-batch_size would skip the trailing samples, and a whole
    # batch whenever the data size is a multiple of batch_size.)
    for i in range(0,len(x_train),batch_size):
        x_subset = x_train[i:i+batch_size]
        y_subset = y_train[i:i+batch_size]
        with tf.GradientTape() as tape: # records the forward pass for differentiation
            prediction = model(x_subset,training=True)
            loss = cost_function(y_subset,prediction)
        print(float(loss))
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

84.74822533160572
84.35508015369291
84.25782314356826
83.76732313753173
82.639131536272
80.8979455918617
80.3431090709479
80.01507155914366
80.75396875536576
79.31840045479245
77.50355033901897
77.66948304562254
75.4860981252722
74.9150687118081
74.43922094182699
75.37967582727882
73.26798600278995
72.93294215336195
71.9152208255876
72.93485900168491
70.54453616722377
70.63622149383244
68.81085899489392
69.09649443806408
67.74491888391863
68.74428725370268
67.46187268920768
67.6461981520737
66.72930756781604
65.40196788177057
65.71130256600296
65.79401166100088
64.99836275005312
65.81162019821355
66.54060789134267
65.6869308058516
65.67879516636614
63.65897870330027
63.502684274030656
64.86879990595966
64.36028458847449
63.16489137618575
62.77196092908938
61.84960123620827
64.11596904775539
63.51709825899983
63.749859873938306
62.09140461531026
62.74954054480033
62.15953446043096
62.79326452350074
62.55458567970387
63.148238394833314
61.576232860301076
61.61917859642953
62.290061675911

### Language Model Generation

In [94]:
# Seed the generator with one target window taken from the training data
start = y_train[90]
# Decode each one-hot row back to its token.
# NOTE: this rebinds `words`, which earlier cells use for the full corpus token list;
# re-running those cells after this one will not see the original corpus.
words = [decoder[np.argmax(v)] for v in start]
# Embeddings are read from the KeyedVectors (`.wv`); indexing the model object
# directly is a deprecated gensim shortcut.
words_emb = [w2v_model.wv[word] for word in words]
" ".join(words),start.shape

('margin of the lake -- evinced a wondrous and confidence', (10, 5398))

In [95]:
vocab_size = len(decoder) # number of classes the model scores at every position
for _ in range(100): # generating 100 words, one per iteration
    words_emb_vec = np.expand_dims(np.array(words_emb),axis=0) # (1,10,128)
    logits = model(words_emb_vec)
    # distribution over the vocabulary for the word that follows the current context
    next_word_probs = tf.nn.softmax(logits,axis=-1).numpy()[0,-1]
    pred = int(np.random.choice(vocab_size,p=next_word_probs)) # sampled prediction
    #pred = int(np.argmax(next_word_probs)) # argmax prediction
    pred_word = decoder[pred]
    words.append(pred_word)
    # slide the context window: drop the oldest embedding, append the newest
    words_emb = words_emb[1:] + [w2v_model.wv[pred_word]]

In [96]:
" ".join(words)

'margin of the lake -- evinced a wondrous and confidence and hammock . Cabaco again only would for him felt feature of in the vapoury use voices and If like the inhabitants there it he surface were meet with also rather it that voyage descried below ) we him if too , the great measured conduct case been and had . , Your , with this , will him . completely since them As animated , extreme of him scuttle in a pondered from a , in that these the . , till with choice , these In a clinging idea in to then . flames commanded indeed -- after'