In [1]:

import re
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()


def one_hot(array):
    unique, inverse = np.unique(array, return_inverse=True)
    onehot = np.eye(unique.shape[0])[inverse]
    return unique,onehot
def slice_sequence(sentence):
    return re.findall(r'"|\'|\w+|\S+', sentence.lower())
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/glove6b/glove.6B.200d.txt
/kaggle/input/glove6b/glove.6B.50d.txt
/kaggle/input/glove6b/glove.6B.300d.txt
/kaggle/input/glove6b/glove.6B.100d.txt
/kaggle/input/ise-competition-1/sample_submission/sample_submission.csv
/kaggle/input/ise-competition-1/test/test.csv
/kaggle/input/ise-competition-1/train/train.csv


In [2]:
maxLen=64
embed=200
epochs=20


In [3]:
train_csv=pd.read_csv("/kaggle/input/ise-competition-1/train/train.csv")
train=train_csv.to_numpy(dtype=str)
id = train[:,0]
seq = train[:,1]
out = train[:,2]
possible_output,out_oh=one_hot(out)
print(possible_output)


['EAP' 'HPL' 'MWS']


In [4]:
longest = max(seq, key=lambda x: len(re.findall(r'\w+|\S+', x.lower())))
shortest= min(seq, key=lambda x: len(re.findall(r'\w+|\S+', x.lower())))
#print (longest)
print(shortest)
print(slice_sequence(shortest))
#maxLen = len(re.findall(r'\w+|\S+', longest.lower()))
#maxLen = min(maxLen,200)

"PIQUANT EXPRESSIONS.
['"', 'piquant', 'expressions', '.']


In [5]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(f'/kaggle/input/glove6b/glove.6B.{embed}d.txt')

In [6]:
def subseq_to_indices(seq,word_to_index,max_len):
    indices=np.zeros(max_len)
    j=0
    for w in seq:
        # if w exists in the word_to_index dictionary
        if w in word_to_index:
        # Set the (i,j)th entry of X_indices to the index of the correct word.
            indices[j] = word_to_index[w]
            j = j+1    
    return indices
    
def process_data(id,seq,out,word_to_index,max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape should be such that it can be given to `Embedding()` 
    if sentence is longer than maxlength then it will be split up to multiple sequences
    """
    m = seq.shape[0]                                   # number of training examples
    
    # Initialize X_indices as a numpy matrix of zeros and the correct shape (≈ 1 line)
    #X_indices = np.zeros((0,max_len))
    id_list=[]
    indices_list=[]
    out_list=[]

    for i in range(m):    # loop over training examples
        # Convert the ith training sentence to lower case and split it into words
        sentence_words= slice_sequence(seq[i])
        slen=len(sentence_words)
        # Initialize j to 0
        j = 0
        # Loop over the words of sentence_words
        if slen<=max_len :
            indices=subseq_to_indices(sentence_words,word_to_index,max_len)
            indices_list.append(indices)
            id_list.append(id[i])
            if out is not None:
                out_list.append(out[i])
        else:
            count = slen // max_len
            last  = slen %  max_len
            for c in range(count):
                subseq=sentence_words[c*max_len:(c+1)*max_len]
                indices=subseq_to_indices(subseq,word_to_index,max_len)
                indices_list.append(indices)
                id_list.append(id[i])
                if out is not None:
                    out_list.append(out[i])
            if last!=0:
                subseq=sentence_words[-max_len:]
                indices=subseq_to_indices(subseq,word_to_index,max_len)
                indices_list.append(indices)
                id_list.append(id[i])
                if out is not None:
                    out_list.append(out[i])
    id_train=np.array(id_list)
    X_train=np.stack( indices_list, axis=0 )
    Y_train=None
    if out is not None:
        Y_train=np.array(out_list)
    return id_train,X_train,Y_train
    

In [7]:
# tokenize and create format
def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape should be such that it can be given to `Embedding()` 
    
    Arguments:
    X -- array of sentences (strings), of shape (m,)
    word_to_index -- a dictionary containing the each word mapped to its index
    max_len -- maximum number of words in a sentence. You can assume every sentence in X is no longer than this. 
    
    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """
    
    m = X.shape[0]                                   # number of training examples
    
    # Initialize X_indices as a numpy matrix of zeros and the correct shape (≈ 1 line)
    X_indices = np.zeros((m,max_len))
    
    for i in range(m):                               # loop over training examples
        # Convert the ith training sentence to lower case and split it into words
        sentence_words= slice_sequence(X[i])
        # Initialize j to 0
        j = 0
        # Loop over the words of sentence_words
        for w in sentence_words:
            # if w exists in the word_to_index dictionary
            if w in word_to_index:
                # Set the (i,j)th entry of X_indices to the index of the correct word.
                X_indices[i, j] = word_to_index[w]
                j = j+1    
                if j>=max_len:
                    break
    return X_indices

In [8]:

#print(np.int32(X_train[0:5]))
#print(Y_train[0:5])
X_input=sentences_to_indices(seq,word_to_index,maxLen)
Y_input=out_oh
X_train=X_input[:-500]
Y_train=Y_input[:-500]
X_dev=X_input[-500:]
Y_dev=Y_input[-500:]
print(X_train.shape[0])
#print(X_train)
#print(Y_train)

19079


In [9]:
import tensorflow
np.random.seed(0)
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout, LSTM, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.initializers import glorot_uniform
np.random.seed(1)

2024-03-15 08:46:42.607422: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-15 08:46:42.607579: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-15 08:46:42.766346: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [10]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.
    
    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    embedding_layer -- pretrained layer Keras instance
    """
    
    vocab_size = len(word_to_index) + 1              # adding 1 to fit Keras embedding (requirement)
    any_word = next(iter(word_to_vec_map.keys()))
    emb_dim = word_to_vec_map[any_word].shape[0]    # define dimensionality of your GloVe word vectors (= 50)
      
    # Initialize the embedding matrix as a numpy array of zeros.
    emb_matrix = np.zeros((vocab_size,emb_dim))
    
    # Set each row "idx" of the embedding matrix to be 
    # the word vector representation of the idx'th word of the vocabulary
    for word, idx in word_to_index.items():
        emb_matrix[idx, :] = word_to_vec_map[word]

    # Define Keras embedding layer with the correct input and output sizes
    # Make it non-trainable.
    embedding_layer = Embedding(vocab_size,emb_dim)

    # Build the embedding layer, it is required before setting the weights of the embedding layer. 
    embedding_layer.build((None,)) # Do not modify the "None".  This line of code is complete as-is.
    # Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])
    embedding_layer.trainable = False

    return embedding_layer

In [11]:
# Model Authorize_s3

def Authorize_s3(input_shape,softmax_size, word_to_vec_map, word_to_index, LSTM_layers, LSTM_units):
    """
    Function creating the Emojify-v2 model's graph.
    
    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)
    LSTM_layers, LSTM_units -- number of lstm layer and hidden unit
    Returns:
    model -- a model instance in Keras
    """
    
    # Define sentence_indices as the input of the graph.
    # It should be of shape input_shape and dtype 'int32' (as it contains indices, which are integers).
    sentence_indices = Input(shape=input_shape,dtype='int32')
    # Create the embedding layer pretrained with GloVe Vectors (≈1 line)
    embedding_layer = pretrained_embedding_layer(word_to_vec_map,word_to_index)
    # Propagate sentence_indices through your embedding layer
    X = embedding_layer(sentence_indices)   
    
    # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
    # The returned output should be a batch of sequences.
    for i in range(LSTM_layers):
        X = LSTM(units=LSTM_units,return_sequences=True)(X)
        # Add dropout with a probability of 0.5
        X = Dropout(0.4)(X)
    # Propagate X trough another LSTM layer with 128-dimensional hidden state
    # The returned output should be a single hidden state, not a batch of sequences.
    X = LSTM(units=LSTM_units,return_sequences=False)(X)
    # Add dropout with a probability of 0.5
    X = Dropout(0.5)(X)
    # Propagate X through a Dense layer 
    X = Dense(softmax_size)(X)
    # Add a softmax activation
    X = Activation('softmax')(X)
    
    # Create Model instance which converts sentence_indices into X.
    model = Model(sentence_indices,X)
    model.layers[0].trainable = False

    return model

In [12]:
model1 = Authorize_s3((maxLen,),len(possible_output), word_to_vec_map, word_to_index,1,256)
model1.summary()
model1.compile(loss='categorical_crossentropy',
              optimizer=tensorflow.keras.optimizers.Adam(learning_rate=0.001),
              metrics=['accuracy'])

model2 = Authorize_s3((maxLen,),len(possible_output), word_to_vec_map, word_to_index,2,128)
model2.summary()
model2.compile(loss='categorical_crossentropy',
              optimizer=tensorflow.keras.optimizers.Adam(learning_rate=0.001),
              metrics=['accuracy'])
model3 = Authorize_s3((maxLen,),len(possible_output), word_to_vec_map, word_to_index,3,128)
model3.summary()
model3.compile(loss='categorical_crossentropy',
              optimizer=tensorflow.keras.optimizers.Adam(learning_rate=0.001),
              metrics=['accuracy'])

In [13]:
model1.fit(X_train, Y_train, epochs = 10, batch_size = 32, shuffle=True)
model1.save(f'/kaggle/working/modelv2.1_len{maxLen}_embed{embed}_ep10.h5')

model2.fit(X_train, Y_train, epochs = 10, batch_size = 32, shuffle=True)
model2.save(f'/kaggle/working/modelv2.2_len{maxLen}_embed{embed}_ep10.h5')

model3.fit(X_train, Y_train, epochs = 10, batch_size = 32, shuffle=True)
model3.save(f'/kaggle/working/modelv2.3_len{maxLen}_embed{embed}_ep10.h5')

print("ep 10 evaluate")
model1.evaluate(X_dev,Y_dev)
model2.evaluate(X_dev,Y_dev)
model3.evaluate(X_dev,Y_dev)

Epoch 1/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 312ms/step - accuracy: 0.4327 - loss: 1.0715
Epoch 2/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 313ms/step - accuracy: 0.6311 - loss: 0.8619
Epoch 3/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 312ms/step - accuracy: 0.7128 - loss: 0.7186
Epoch 4/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 312ms/step - accuracy: 0.7413 - loss: 0.6336
Epoch 5/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 311ms/step - accuracy: 0.7847 - loss: 0.5530
Epoch 6/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 312ms/step - accuracy: 0.8176 - loss: 0.4800
Epoch 7/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 313ms/step - accuracy: 0.8578 - loss: 0.3817
Epoch 8/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 314ms/step - accuracy: 0.8889 - loss: 0.3122
Epoch 9/

[0.6556474566459656, 0.7799999713897705]

In [14]:
model1.fit(X_train, Y_train, epochs = 10, batch_size = 32, shuffle=True)
model1.save(f'/kaggle/working/modelv2.1_len{maxLen}_embed{embed}_ep20.h5')

model2.fit(X_train, Y_train, epochs = 10, batch_size = 32, shuffle=True)
model2.save(f'/kaggle/working/modelv2.2_len{maxLen}_embed{embed}_ep20.h5')

model3.fit(X_train, Y_train, epochs = 10, batch_size = 32, shuffle=True)
model3.save(f'/kaggle/working/modelv2.3_len{maxLen}_embed{embed}_ep20.h5')

print("ep 20 evaluate")
model1.evaluate(X_dev,Y_dev)
model2.evaluate(X_dev,Y_dev)
model3.evaluate(X_dev,Y_dev)

Epoch 1/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 317ms/step - accuracy: 0.9535 - loss: 0.1590
Epoch 2/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 307ms/step - accuracy: 0.9552 - loss: 0.1606
Epoch 3/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 309ms/step - accuracy: 0.9570 - loss: 0.1430
Epoch 4/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 315ms/step - accuracy: 0.9701 - loss: 0.1058
Epoch 5/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 316ms/step - accuracy: 0.9757 - loss: 0.0868
Epoch 6/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 317ms/step - accuracy: 0.9776 - loss: 0.0826
Epoch 7/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 316ms/step - accuracy: 0.9818 - loss: 0.0686
Epoch 8/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 316ms/step - accuracy: 0.9827 - loss: 0.0639
Epoch 9/

[1.1143227815628052, 0.734000027179718]

In [15]:
model1.fit(X_train, Y_train, epochs = 10, batch_size = 32, shuffle=True)
model1.save(f'/kaggle/working/modelv2.1_len{maxLen}_embed{embed}_ep30.h5')

model2.fit(X_train, Y_train, epochs = 10, batch_size = 32, shuffle=True)
model2.save(f'/kaggle/working/modelv2.2_len{maxLen}_embed{embed}_ep30.h5')

model3.fit(X_train, Y_train, epochs = 10, batch_size = 32, shuffle=True)
model3.save(f'/kaggle/working/modelv2.3_len{maxLen}_embed{embed}_ep30.h5')

print("ep 30 evaluate")
model1.evaluate(X_dev,Y_dev)
model2.evaluate(X_dev,Y_dev)
model3.evaluate(X_dev,Y_dev)

Epoch 1/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 322ms/step - accuracy: 0.9870 - loss: 0.0500
Epoch 2/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 323ms/step - accuracy: 0.9889 - loss: 0.0426
Epoch 3/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 318ms/step - accuracy: 0.9875 - loss: 0.0475
Epoch 4/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 317ms/step - accuracy: 0.9863 - loss: 0.0476
Epoch 5/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 321ms/step - accuracy: 0.9842 - loss: 0.0522
Epoch 6/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 318ms/step - accuracy: 0.9866 - loss: 0.0429
Epoch 7/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 314ms/step - accuracy: 0.9874 - loss: 0.0443
Epoch 8/10
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 313ms/step - accuracy: 0.9917 - loss: 0.0312
Epoch 9/

[1.08786141872406, 0.765999972820282]