In [1]:
import pandas as pd
import re
# Build a lower-cased {contraction: meaning} lookup from contractions.csv.
# The 'Contraction' column becomes the index, 'Meaning' holds the expansion.
contractions = pd.read_csv('./contractions.csv', index_col='Contraction')
contractions.index = contractions.index.str.lower()
contractions['Meaning'] = contractions['Meaning'].str.lower()
contractions_dict = contractions['Meaning'].to_dict()

# Regex patterns used by the text pre-processing step.
# All patterns containing backslashes are raw strings so that sequences such
# as \s and \1 reach the regex engine untouched (a non-raw '\s' is an invalid
# string escape and triggers a warning on recent Python versions).

# Anything starting with http://, https:// or www. up to the next space.
urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|(www\.)[^ ]*)"
# '@' followed by one or more non-whitespace characters.
userPattern       = r'@[^\s]+'
# '#' followed by one or more non-whitespace characters.
# NOTE(review): not referenced by preprocess_apply in the visible notebook.
hashtagPattern    = r'#[^\s]+'
# Any single character that is NOT a-z, 0-9, '<' or '>'.
alphaPattern      = "[^a-z0-9<>]"
# The same character (any character except a newline) three or more times in a row ...
sequencePattern   = r"(.)\1\1+"
# ... is replaced by exactly two occurrences of that character.
seqReplacePattern = r"\1\1"


def preprocess_apply(tweet):
    """Normalise one piece of text for tokenisation.

    The steps are applied in this order:

    1. lower-case the text;
    2. replace every match of ``urlPattern`` with ``'<url>'``;
    3. replace every match of ``userPattern`` with ``'<user>'``;
    4. collapse runs of three or more identical characters to two;
    5. expand contractions by plain substring replacement using
       ``contractions_dict``;
    6. replace every character outside ``a-z``, ``0-9``, ``<`` and ``>``
       with a single space.

    Parameters
    ----------
    tweet : str
        The raw text. ``.lower()`` is called on it directly, so a non-string
        value (e.g. a missing value) raises ``AttributeError``.

    Returns
    -------
    str
        The normalised text. Whitespace is not collapsed, so consecutive
        spaces may remain where several characters were replaced.
    """
    tweet = tweet.lower()

    # Replace all URLs with '<url>'.
    tweet = re.sub(urlPattern, '<url>', tweet)
    # Replace @USERNAME with '<user>'.
    tweet = re.sub(userPattern, '<user>', tweet)
    # Collapse 3 or more consecutive identical characters (any character the
    # pattern's '.' matches, not only letters) down to 2.
    tweet = re.sub(sequencePattern, seqReplacePattern, tweet)

    # Expand contractions. This is a substring replacement, so a key is also
    # replaced when it occurs inside a longer token.
    for contraction, replacement in contractions_dict.items():
        tweet = tweet.replace(contraction, replacement)

    # Replace everything except a-z, 0-9, '<' and '>' with a space. This
    # already turns every '/' into a space, so no separate '/' handling is
    # needed afterwards.
    tweet = re.sub(alphaPattern, ' ', tweet)
    return tweet

In [2]:
# Read the sentence-level reviews and attach a cleaned copy of each review
# (produced by preprocess_apply) as the 'processed_text' column.
df = pd.read_csv('iclr_2019_sentence.csv').assign(
    processed_text=lambda frame: frame['review'].apply(preprocess_apply)
)
df

Unnamed: 0,review_id,review,processed_text
0,20193,This paper presents a new approach to learning...,this paper presents a new approach to learning...
1,20193,"The model, FAVAE, is based on the information ...",the model favae is based on the information ...
2,20193,The authors demonstrate that their approach is...,the authors demonstrate that their approach is...
3,20193,I also like the approach that the authors are ...,i also like the approach that the authors are ...
4,20193,"However, the paper could be improved by clarif...",however the paper could be improved by clarif...
...,...,...,...
35451,20194748,"2: Change ""Linear Discriminant"" to ""linear dis...",2 change linear discriminant to linear dis...
35452,20194748,"Also, remove--------the abbreviations (SVM and...",also remove the abbreviations svm and lda ...
35453,20194748,"5: Delete comma in ""assumption, that.""--------...",5 delete comma in assumption that p
35454,20194748,"8: ""nearly perfect"" -> ""nearly perfectly""-----...",8 nearly perfect > nearly perfectly the...


In [3]:
from gensim.models import Word2Vec

# Whitespace-tokenise every processed sentence; Word2Vec expects a list of
# token lists.
Word2vec_data = [sentence.split() for sentence in df['processed_text'].values]

# Train the Word2Vec model. min_count is not passed, so the library default
# applies.
word2vec_model = Word2Vec(
    Word2vec_data,
    vector_size=300,
    workers=8,
)

vocabulary_size = len(word2vec_model.wv.key_to_index)
print("Vocabulary Length:", vocabulary_size)

# Second, untouched copy of the input file (read again from disk).
final_data = pd.read_csv('iclr_2019_sentence.csv')


Vocabulary Length: 6712


In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
#%%
# The tokenizer is configured with no character filtering and no lower-casing
# of its own, so it must be fed text that preprocess_apply has already
# normalised.
# NOTE(review): the word -> index mapping is fit on THIS corpus. The model
# loaded later from the .h5 file presumably expects the indices of the
# tokenizer used at training time -- confirm the two vocabularies agree.
tokenizer = Tokenizer(filters="", lower=False, oov_token="<oov>")
tokenizer.fit_on_texts(df['processed_text'].values)

# Cap the vocabulary at the Word2Vec vocabulary size plus a margin of 500.
VOCAB_LEN = len(word2vec_model.wv.key_to_index) + 500
tokenizer.num_words = VOCAB_LEN
print("Tokenizer vocab length:", tokenizer.num_words)

# Encode the same pre-processed sentences the tokenizer was fit on. Encoding
# the raw 'review' column instead would send every capitalised or punctuated
# token to '<oov>', because the tokenizer neither lower-cases nor filters.
X_train = pad_sequences(
    tokenizer.texts_to_sequences(df['processed_text'].values),
    maxlen=768,
)

# Persist the padded id matrix so it can be reloaded for prediction.
pd.DataFrame(X_train).to_csv('Tokennized_Processed-BiLSTM-2019.csv', index=False)


2025-02-21 15:26:22.017965: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


Tokenizer vocab length: 7212


In [5]:
# Reload the padded token-id matrix from CSV; the file name matches the one
# written by the tokenisation cell above.
LOAD_PATH = './Tokennized_Processed-BiLSTM-2019.csv'
custom_val_embeds = pd.read_csv(LOAD_PATH)

In [6]:
import tensorflow as tf
from keras.layers import Bidirectional, Input, Dense, Layer, Dropout, LSTM, Embedding, Flatten
from keras.models import Sequential, Model
from tensorflow.python.keras.callbacks import EarlyStopping
from keras import backend as K

In [7]:
class Attention(Layer):
    """Attention pooling layer that reduces axis 1 of its input.

    For an input ``x`` the layer computes a score ``tanh(x . W + b)`` per
    position along axis 1, normalises the scores with a softmax, scales ``x``
    by the resulting weights and sums over axis 1.

    Assumes a 3-D input of shape ``(batch, timesteps, features)`` with a fixed
    ``timesteps`` dimension, since the bias has shape ``(input_shape[1], 1)``
    -- TODO confirm against the model this layer is used in.

    Parameters
    ----------
    return_sequences : bool, default True
        Stored on the instance and included in ``get_config``. It is NOT
        consulted by ``call``: the layer always returns the sum over axis 1.
        The behaviour is kept as is so that previously saved models load and
        predict unchanged.
    **kwargs
        Forwarded to the base ``Layer`` constructor (e.g. ``name``).
    """

    def __init__(self, return_sequences=True, **kwargs):
        # Initialise the base Layer exactly once, forwarding all base-layer
        # keyword arguments; a second bare call would only repeat the setup.
        super(Attention, self).__init__(**kwargs)
        self.return_sequences = return_sequences

    def get_config(self):
        """Return the base layer config extended with ``return_sequences``."""
        config = super(Attention, self).get_config().copy()
        config.update({
            'return_sequences': self.return_sequences,
        })
        return config

    def build(self, input_shape):
        """Create the score weights ``W`` and the per-position bias ``b``."""
        # One weight per input feature (last axis of the input).
        self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1),
                                 initializer="normal")
        # One bias per position along axis 1 of the input.
        self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1),
                                 initializer="zeros")

        super(Attention, self).build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        # Raw scores; the trailing size-1 axis is dropped before the softmax.
        e = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
        # Normalise the scores into weights.
        a = K.softmax(e)
        # Restore the trailing axis so the weights broadcast against x.
        a = K.expand_dims(a, axis=-1)
        output = x * a

        return K.sum(output, axis=1)

In [8]:
from keras.models import load_model
import numpy as np

def loadModel(name, PATH, X):
    """Load a saved Keras model and announce it on stdout.

    Parameters
    ----------
    name : str
        Label printed in the "MODEL LOADED" message.
    PATH : str
        Location of the saved model file.
    X
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    The model returned by ``load_model``, with the custom ``Attention`` layer
    registered so it can be deserialised.
    """
    custom_layers = {'Attention': Attention}
    loaded = load_model(PATH, custom_objects=custom_layers)

    banner = name + " MODEL LOADED\n\n"
    print(banner)
    return loaded


# Location of the saved model file that loadModel reads.
PATH = './Politeness_Custom-Embedding-BiLSTM.h5'
# The third argument is accepted by loadModel but not used by it.
custom_model = loadModel('Custom Embed', PATH, custom_val_embeds)

2025-02-21 15:26:54.801594: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-02-21 15:26:55.254911: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22302 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:a0:00.0, compute capability: 8.6


Custom Embed MODEL LOADED




In [9]:
def adjustIndex(arr):
    """Return a new list holding every element of ``arr`` increased by one."""
    shifted = []
    for value in arr:
        shifted.append(value + 1)
    return shifted
#%%

# Run the loaded model over the reloaded token-id matrix.
y_pred_Custom = custom_model.predict(custom_val_embeds)

# Pick the highest-scoring column per row, then shift the 0-based argmax
# positions by one via adjustIndex.
predicted_positions = np.argmax(y_pred_Custom, axis=1)
y_pred_Custom_idx = adjustIndex(predicted_positions)

2025-02-21 15:27:01.699859: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8101


   6/1108 [..............................] - ETA: 33s 

2025-02-21 15:27:02.452939: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.




In [10]:
#%%
# CONCATENATE RESULTS
# Assemble ids, original review text and predicted labels side by side,
# then write them out without the index column.
results = pd.DataFrame({
    'review_id': df.review_id,
    'reviews': df.review,
    'politeness': y_pred_Custom_idx,
})
results.to_csv('iclr_2019_politeness.csv', index=False)