In [1]:

import re
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
def read_glove_vecs(glove_file):
    """
    Load GloVe word vectors from a whitespace-separated text file.

    Every non-empty line is expected to contain a word followed by its vector components.

    Arguments:
    glove_file -- path to the GloVe text file

    Returns:
    words_to_index -- dictionary mapping each word to its 1-based position in sorted word order
                      (index 0 is never assigned to a word)
    index_to_words -- inverse mapping of words_to_index
    word_to_vec_map -- dictionary mapping each word to its vector as a float64 numpy array
    """
    word_to_vec_map = {}
    # Read explicitly as UTF-8 so the vocabulary does not depend on the platform's default encoding.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank lines (e.g. a trailing newline at the end of the file) carry no word.
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    words_to_index = {}
    index_to_words = {}
    # Indices start at 1; sorting makes the numbering independent of the file's line order.
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map

def softmax(x, axis=None):
    """
    Compute softmax values for the scores in x.

    Arguments:
    x -- array-like of scores
    axis -- axis along which each set of scores is normalised. The default (None) normalises
            over the whole array, so all entries together sum to 1.

    Returns:
    array of the same shape as x holding the softmax probabilities
    """
    x = np.asarray(x)
    # Subtracting the maximum leaves the result unchanged but keeps exp() from overflowing.
    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e_x / e_x.sum(axis=axis, keepdims=True)


def one_hot(array):
    """
    One-hot encode the values of an array.

    Arguments:
    array -- array of labels

    Returns:
    classes -- sorted unique labels; column k of the encoding corresponds to classes[k]
    encoded -- matrix with one row per input element and a single 1.0 in the column of its label
    """
    classes, codes = np.unique(array, return_inverse=True)
    # Picking row `code` of the identity matrix yields the one-hot vector for that class.
    identity = np.eye(len(classes))
    encoded = identity[codes]
    return classes, encoded
def slice_sequence(sentence):
    """
    Lower-case a sentence and split it into tokens.

    Quote characters become tokens of their own, runs of word characters form one token,
    and any other run of non-whitespace characters forms one token.
    """
    lowered = sentence.lower()
    tokens = re.findall(r'"|\'|\w+|\S+', lowered)
    return tokens
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)


/kaggle/input/ise-competition-1/sample_submission/sample_submission.csv
/kaggle/input/ise-competition-1/test/test.csv
/kaggle/input/ise-competition-1/train/train.csv
/kaggle/input/glove6b/glove.6B.200d.txt
/kaggle/input/glove6b/glove.6B.50d.txt
/kaggle/input/glove6b/glove.6B.300d.txt
/kaggle/input/glove6b/glove.6B.100d.txt


In [2]:
# Number of token indices per model input row; passed as max_len to process_data /
# sentences_to_indices and used as the model's input shape.
maxLen=64
# Dimensionality of the GloVe vectors; selects which glove.6B.<embed>d.txt file is loaded.
embed=200
# Number of training epochs passed to model.fit.
epochs=20


In [3]:
# Read the training CSV and view every cell as a string.
train_csv = pd.read_csv("/kaggle/input/ise-competition-1/train/train.csv")
train = train_csv.to_numpy(dtype=str)

# Columns by position: 0 = sample id, 1 = sentence text, 2 = author label.
# NOTE: `id` shadows the Python built-in; the name is kept because later cells refer to it.
id, seq, out = train[:, 0], train[:, 1], train[:, 2]

# possible_output[k] is the label encoded by column k of out_oh.
possible_output, out_oh = one_hot(out)
print(possible_output)


['EAP' 'HPL' 'MWS']


In [4]:
# Measure sentence length with slice_sequence, the same tokenizer that builds the model input,
# so the lengths inspected here match what process_data / sentences_to_indices will see.
longest = max(seq, key=lambda x: len(slice_sequence(x)))
shortest = min(seq, key=lambda x: len(slice_sequence(x)))
#print (longest)
print(shortest)
print(slice_sequence(shortest))
#maxLen = len(slice_sequence(longest))
#maxLen = min(maxLen,200)

"PIQUANT EXPRESSIONS.
['"', 'piquant', 'expressions', '.']


In [5]:
# Load the GloVe table whose dimensionality matches `embed`.
glove_path = f'/kaggle/input/glove6b/glove.6B.{embed}d.txt'
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(glove_path)

In [6]:
def subseq_to_indices(seq,word_to_index,max_len):
    """
    Convert a list of tokens into a fixed-length vector of vocabulary indices.

    Tokens missing from word_to_index are skipped. Known tokens are written left to right and
    the remaining positions stay 0. At most max_len indices are written, so a sequence with
    more known tokens than max_len is truncated instead of raising IndexError.

    Arguments:
    seq -- iterable of tokens
    word_to_index -- dictionary mapping each word to its index
    max_len -- length of the returned vector

    Returns:
    indices -- numpy array of shape (max_len,)
    """
    indices = np.zeros(max_len)
    j = 0
    for w in seq:
        if j >= max_len:
            # The vector is full; the remaining tokens cannot be stored.
            break
        if w in word_to_index:
            indices[j] = word_to_index[w]
            j += 1
    return indices
    
def process_data(id,seq,out,word_to_index,max_len):
    """
    Convert sentences into fixed-length rows of vocabulary indices, splitting long sentences.

    A sentence with at most max_len tokens produces one row. A longer sentence produces one row
    per consecutive full window of max_len tokens, plus, when the length is not a multiple of
    max_len, one extra row built from the last max_len tokens (which overlaps the previous
    window). Every row repeats the id and output of the sentence it came from.

    Arguments:
    id -- array of sample identifiers, aligned with seq
    seq -- array of sentences (strings), of shape (m,)
    out -- array of outputs aligned with seq, or None when there are no outputs
    word_to_index -- dictionary mapping each word to its index
    max_len -- number of indices per row

    Returns:
    id_train -- array with the source id of every row
    X_train -- array of shape (rows, max_len) with the word indices; shape (0, max_len) when seq is empty
    Y_train -- array with the output of every row, or None when out is None
    """
    id_list = []
    indices_list = []
    out_list = []

    for i in range(seq.shape[0]):    # loop over the input sentences
        sentence_words = slice_sequence(seq[i])
        slen = len(sentence_words)

        if slen <= max_len:
            windows = [sentence_words]
        else:
            # Consecutive, non-overlapping full windows ...
            windows = [sentence_words[c * max_len:(c + 1) * max_len]
                       for c in range(slen // max_len)]
            # ... and, when tokens are left over, a final window aligned to the sentence end.
            if slen % max_len != 0:
                windows.append(sentence_words[-max_len:])

        for window in windows:
            indices_list.append(subseq_to_indices(window, word_to_index, max_len))
            id_list.append(id[i])
            if out is not None:
                out_list.append(out[i])

    id_train = np.array(id_list)
    if indices_list:
        X_train = np.stack(indices_list, axis=0)
    else:
        # np.stack rejects an empty list; return an empty matrix of the right width instead.
        X_train = np.zeros((0, max_len))
    Y_train = np.array(out_list) if out is not None else None
    return id_train, X_train, Y_train
    

In [7]:
# tokenize and create format
def sentences_to_indices(X, word_to_index, max_len):
    """
    Turn each sentence into one fixed-length row of vocabulary indices.

    Tokens that are not in word_to_index are skipped. Only the first max_len known tokens of a
    sentence are kept; shorter sentences are padded on the right with 0.

    Arguments:
    X -- array of sentences (strings), of shape (m,)
    word_to_index -- dictionary mapping each word to its index
    max_len -- number of indices stored per sentence

    Returns:
    X_indices -- array of shape (m, max_len) with the word indices of each sentence
    """
    n_sentences = X.shape[0]
    X_indices = np.zeros((n_sentences, max_len))

    for row in range(n_sentences):
        # Lazily yield the index of every token that is part of the vocabulary.
        known = (word_to_index[tok] for tok in slice_sequence(X[row]) if tok in word_to_index)
        for col, idx in enumerate(known):
            X_indices[row, col] = idx
            # Stop as soon as the row is full.
            if col + 1 >= max_len:
                break
    return X_indices

In [8]:
def process_test():
    """
    Print a side-by-side sanity check of the two indexers on the first training sentence.

    process_data is run with max_len=20 so that the sentence is split into several rows, and
    sentences_to_indices is run with maxLen on the same sentence for comparison.
    """
    id_train,X_train,Y_train=process_data(id[0:1],
                                          seq[0:1],
                                          out_oh[0:1],
                                          word_to_index,
                                          20
                                         )
    print(id_train)
    print(seq[0:1])
    print(np.int32(X_train))

    # Only the first sentence is displayed, so only that sentence is indexed.
    X_train2=sentences_to_indices(seq[0:1], word_to_index, maxLen)
    print(np.int32(X_train2))
    print(Y_train)
process_test();

#print(np.int32(X_train[0:5]))
#print(Y_train[0:5])
# Index the whole training set; sentences longer than maxLen tokens contribute several rows.
id_train, X_train, Y_train = process_data(id, seq, out_oh, word_to_index, maxLen)
# Number of rows after splitting.
print(X_train.shape[0])

['id26305' 'id26305' 'id26305']
['This process, however, afforded me no means of ascertaining the dimensions of my dungeon; as I might make its circuit, and return to the point whence I set out, without being aware of the fact; so perfectly uniform seemed the wall.']
[[358161 292849    452 182574    452  47561 239106 262351 239198 268047
   60830 357267 124214 268047 254259 130811  42950  60666 185458 243978]
 [231459 193920 101461    452  54719 307030 360916 357267 287480 386426
  185458 325900 272931    452 388757  72183  64318 268047 357267 143955]
 [287480 386426 185458 325900 272931    452 388757  72183  64318 268047
  357267 143955  42950 336115 281284 372015 323427 357267 382655    867]]
[[358161 292849    452 182574    452  47561 239106 262351 239198 268047
   60830 357267 124214 268047 254259 130811  42950  60666 185458 243978
  231459 193920 101461    452  54719 307030 360916 357267 287480 386426
  185458 325900 272931    452 388757  72183  64318 268047 357267 143955
   42950

In [9]:
import tensorflow
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout, LSTM, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.initializers import glorot_uniform

# Seed Python's `random`, NumPy and TensorFlow in one call. Seeding NumPy alone (as was done
# before, twice) does not make weight initialisation or batch shuffling reproducible.
tensorflow.keras.utils.set_random_seed(1)

2024-03-14 14:29:44.600917: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-14 14:29:44.601048: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-14 14:29:44.730133: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [10]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Create a non-trainable Keras Embedding() layer initialised with the given word vectors.

    Arguments:
    word_to_vec_map -- dictionary mapping words to their vector representation
    word_to_index -- dictionary mapping words to their row index in the embedding matrix

    Returns:
    embedding_layer -- Keras Embedding layer holding the pre-trained vectors
    """
    # One row more than there are words, so row len(word_to_index) exists.
    vocab_size = len(word_to_index) + 1
    # The vector length is read from the first entry of the map rather than hard-coded.
    first_vector = next(iter(word_to_vec_map.values()))
    emb_dim = first_vector.shape[0]

    # Rows that no word maps to remain all-zero.
    emb_matrix = np.zeros((vocab_size, emb_dim))
    for word in word_to_index:
        emb_matrix[word_to_index[word], :] = word_to_vec_map[word]

    embedding_layer = Embedding(input_dim=vocab_size, output_dim=emb_dim)
    # The layer must be built before weights can be assigned to it.
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])
    # Keep the pre-trained vectors fixed during training.
    embedding_layer.trainable = False

    return embedding_layer

In [11]:
# Model Authorize_s3

def Authorize_s3(input_shape,softmax_size, word_to_vec_map, word_to_index):
    """
    Build the classifier graph: pre-trained embeddings, two stacked LSTMs and a softmax output.

    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    softmax_size -- number of output classes
    word_to_vec_map -- dictionary mapping every word in the vocabulary to its vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary

    Returns:
    model -- a model instance in Keras
    """
    # The input is a row of integer word indices.
    sentence_indices = Input(shape=input_shape, dtype='int32')

    # Look the indices up in the embedding layer built by pretrained_embedding_layer.
    embeddings = pretrained_embedding_layer(word_to_vec_map, word_to_index)(sentence_indices)

    # First LSTM (256 units) returns the whole sequence so the second LSTM can consume it.
    hidden = LSTM(units=256, return_sequences=True)(embeddings)
    hidden = Dropout(0.5)(hidden)

    # Second LSTM (256 units) returns only its final hidden state.
    hidden = LSTM(units=256, return_sequences=False)(hidden)
    hidden = Dropout(0.5)(hidden)

    # Project onto the classes, then normalise to probabilities.
    scores = Dense(softmax_size)(hidden)
    probabilities = Activation('softmax')(scores)

    return Model(sentence_indices, probabilities)

In [12]:
model = Authorize_s3((maxLen,),len(possible_output), word_to_vec_map, word_to_index)
# model.layers[0] is the Input layer of a functional model, so freezing it does not protect
# the pre-trained vectors. Freeze the Embedding layer(s) explicitly instead.
for layer in model.layers:
    if isinstance(layer, Embedding):
        layer.trainable = False
model.summary()

In [13]:
# Categorical cross-entropy matches the one-hot targets and the softmax output.
optimizer = tensorflow.keras.optimizers.Adam(learning_rate=0.0003)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [14]:
# Train on every row produced by process_data, reshuffling the rows each epoch.
model.fit(
    X_train,
    Y_train,
    epochs=epochs,
    batch_size=64,
    shuffle=True,
)

Epoch 1/20
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 366ms/step - accuracy: 0.5138 - loss: 0.9846
Epoch 2/20
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 369ms/step - accuracy: 0.6873 - loss: 0.7452
Epoch 3/20
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 367ms/step - accuracy: 0.7132 - loss: 0.6883
Epoch 4/20
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 376ms/step - accuracy: 0.7383 - loss: 0.6377
Epoch 5/20
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 374ms/step - accuracy: 0.7567 - loss: 0.5932
Epoch 6/20
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 372ms/step - accuracy: 0.7837 - loss: 0.5393
Epoch 7/20
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 371ms/step - accuracy: 0.7979 - loss: 0.5031
Epoch 8/20
[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 376ms/step - accuracy: 0.8196 - loss: 0.4562
Epoch 9/

<keras.src.callbacks.history.History at 0x7bb8e86ac880>

In [15]:
# The file name records the hyper-parameters the model was trained with.
model_path = f'/kaggle/working/modelv2_len{maxLen}_embed{embed}_ep{epochs}.h5'
model.save(model_path)


In [16]:
# Read the test CSV and view every cell as a string.
test_csv = pd.read_csv("/kaggle/input/ise-competition-1/test/test.csv")
test = test_csv.to_numpy(dtype=str)

# Columns by position: 0 = sample id, 1 = sentence text.
id_test, seq_test = test[:, 0], test[:, 1]

# Unlike the training set (process_data), test sentences are not split: each one gives exactly
# one row, truncated to its first maxLen known tokens, so every id gets a single prediction.
X_test = sentences_to_indices(seq_test, word_to_index, maxLen)
print(seq_test)

['Still, as I urged our leaving Ireland with such inquietude and impatience, my father thought it best to yield.'
 'If a fire wanted fanning, it could readily be fanned with a newspaper, and as the government grew weaker, I have no doubt that leather and iron acquired durability in proportion, for, in a very short time, there was not a pair of bellows in all Rotterdam that ever stood in need of a stitch or required the assistance of a hammer.'
 'And when they had broken down the frail door they found only this: two cleanly picked human skeletons on the earthen floor, and a number of singular beetles crawling in the shadowy corners.'
 ...
 'It is easily understood that what might improve a closely scrutinized detail, may at the same time injure a general or more distantly observed effect.'
 'Be this as it may, I now began to feel the inspiration of a burning hope, and at length nurtured in my secret thoughts a stern and desperate resolution that I would submit no longer to be enslaved.'

In [17]:
# One row of class probabilities per test sentence; columns follow possible_output.
Y_test = model.predict(x=X_test)

[1m263/263[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 100ms/step


In [18]:
import csv

# Find each author's column in the model output by label. A label that is missing from
# possible_output now raises KeyError instead of silently falling back to column 0.
label_to_column = {label: column for column, label in enumerate(possible_output)}
eap_i = label_to_column["EAP"]
hpl_i = label_to_column["HPL"]
mws_i = label_to_column["MWS"]

# One record per test sentence, with the probability columns in the submission's order.
predict = [
    {
        'id' : id_test[i],
        'EAP': Y_test[i][eap_i],
        'HPL': Y_test[i][hpl_i],
        'MWS': Y_test[i][mws_i],
    }
    for i in range(len(id_test))
]

# Build the frame once and reuse it for both the preview and the CSV file.
predict_df = pd.DataFrame(predict)
print(predict_df)
predict_df.to_csv("/kaggle/working/predict.csv",
                  index=False,
                  quotechar='"',
                  quoting=csv.QUOTE_NONNUMERIC)


           id       EAP       HPL       MWS
0     id02310  0.001079  0.000117  0.998804
1     id24541  0.923090  0.074267  0.002643
2     id00134  0.000034  0.999742  0.000224
3     id27757  0.000447  0.999180  0.000373
4     id04081  0.931684  0.064857  0.003459
...       ...       ...       ...       ...
8387  id11749  0.915955  0.051116  0.032929
8388  id10526  0.058773  0.003654  0.937574
8389  id13477  0.912830  0.083669  0.003500
8390  id13761  0.140360  0.804066  0.055574
8391  id04282  0.000054  0.999726  0.000220

[8392 rows x 4 columns]
