In [None]:
!wget -qq https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz -O archive.tar.gz
!tar -xf  'archive.tar.gz' -C .
!pip -qq install eli5 tensorflow_addons

In [2]:
import warnings
warnings.filterwarnings("ignore")

import re, os
from tqdm.notebook import tqdm
from functools import partial

import pandas as pd
import numpy as np

import spacy
from spacy import displacy

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline

import nltk
from nltk.corpus import stopwords

import tensorflow as tf

# Do not truncate long text cells when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)
# Fetch the NLTK stopword corpus and the 'punkt' tokenizer data.
# NOTE(review): 'punkt' is not referenced by any other visible cell — confirm it is needed.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# spaCy's small English pipeline with the dependency parser disabled;
# preprocessor() below only reads entity annotations and lemmas from it.
nlp = spacy.load("en_core_web_sm", disable=['parser'])

In [4]:
def create_dataset(path="aclImdb"):
    """Read every review file under ``path`` into a DataFrame.

    The directory layout ``<path>/{train,test}/{pos,neg}/<file>`` is walked in
    that order, with file names sorted inside each folder, so the row order is
    deterministic.

    Args:
        path: Root directory of the extracted dataset.

    Returns:
        pandas.DataFrame with a ``text`` column (file contents decoded as
        UTF-8) and a ``label`` column (1 for ``pos``, 0 for ``neg``).
    """
    label_dict = {"pos": 1, "neg": 0}

    # Collect (file path, label) pairs first so the progress bar gets its
    # total from the files actually present instead of a hard-coded count.
    entries = []
    for sub in ("train", "test"):
        for sent in ("pos", "neg"):
            root = os.path.join(path, sub, sent)
            entries.extend(
                (os.path.join(root, f), label_dict[sent])
                for f in sorted(os.listdir(root))
            )

    texts, labels = [], []
    # Iterating through tqdm closes the bar automatically once exhausted.
    for filepath, label in tqdm(entries):
        with open(filepath, "r", encoding="utf-8") as infile:
            texts.append(infile.read())
        labels.append(label)
    return pd.DataFrame({"text": texts, "label": labels})

In [5]:
# Load all reviews from the default "aclImdb" directory into one frame (columns: text, label).
df = create_dataset()

  0%|          | 0/50000 [00:00<?, ?it/s]

In [23]:
# Show the first row as a sanity check of the loaded data.
df.head(1)

Unnamed: 0,text,label
0,"Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as ""Teachers"". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is ""Teachers"". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!",1


In [6]:
# English stopwords as a frozenset: preprocessor() tests membership once per
# token, which is O(1) on a set but a linear scan on the original list.
stop = frozenset(nltk.corpus.stopwords.words('english'))

In [7]:
def preprocessor(text, lemma=False, stopw=False, pos=False):
    """Normalize a raw review into a lowercase, space-separated string.

    Steps, in order:
      1. ``pos=True``: run spaCy and replace every token that belongs to a
         named entity with its entity label; other tokens are kept verbatim.
         (Despite the flag name, part-of-speech tags are not used.)
      2. Remove HTML tags.
      3. Collect emoticons (``:)``, ``;-(``, ``=D``, ...), lowercase the text,
         collapse each run of non-word characters into a single space and
         append the emoticons, with any ``-`` removed, at the end.
      4. ``stopw=True``: drop tokens contained in the module-level ``stop``.
      5. ``lemma=True``: run spaCy again and replace each token by its lemma.

    Args:
        text: Raw review text.
        lemma: Lemmatize the cleaned text.
        stopw: Remove stopwords from the cleaned text.
        pos: Replace named-entity tokens by their entity label first.

    Returns:
        The processed text as a single string.
    """
    if pos:
        doc = nlp(text)
        text = " ".join(
            token.text if token.ent_iob_ == "O" else token.ent_type_
            for token in doc
        )

    # Raw strings: '\)' and '\W' are invalid escape sequences in normal
    # string literals and trigger warnings on recent Python versions.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

    text = re.sub(r'[\W]+', ' ', text.lower())
    if emoticons:
        # Keep the first emoticon from being glued onto the last word when
        # the cleaned text does not already end with a space.
        if not text.endswith(' '):
            text += ' '
        text += ' '.join(emoticons).replace('-', '')

    if stopw:
        text = " ".join(token for token in text.split() if token not in stop)

    if lemma:
        doc = nlp(text)
        text = " ".join(token.lemma_ for token in doc)

    return text

In [91]:
# Stratified, shuffled 80/20 split of the raw review text with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(df["text"], df["label"], test_size=0.2, shuffle=True, stratify=df["label"], random_state=42)

In [92]:
# Baseline: TF-IDF over uni- and bigrams (5000 features at most) with stopword
# removal but no lemmatization, followed by logistic regression.
pipe = make_pipeline(TfidfVectorizer(ngram_range=(1, 2), max_features=5000, preprocessor=partial(preprocessor, lemma=False, stopw=True)),
                     LogisticRegression(random_state=42, n_jobs=-1)).fit(x_train, y_train)

# Report on the training split first, then on the held-out split.
print(classification_report(y_train, pipe.predict(x_train)))
print(classification_report(y_test, pipe.predict(x_test)))

              precision    recall  f1-score   support

           0       0.92      0.91      0.91     20000
           1       0.91      0.93      0.92     20000

    accuracy                           0.92     40000
   macro avg       0.92      0.92      0.92     40000
weighted avg       0.92      0.92      0.92     40000

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      5000
           1       0.88      0.90      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [93]:
# Same baseline as the previous cell, but with lemmatization switched on.
# NOTE(review): this rebinds `pipe`, so later cells see whichever pipeline ran last.
pipe = make_pipeline(TfidfVectorizer(ngram_range=(1, 2), max_features=5000, preprocessor=partial(preprocessor, lemma=True, stopw=True)),
                     LogisticRegression(random_state=42, n_jobs=-1)).fit(x_train, y_train)

# Report on the training split first, then on the held-out split.
print(classification_report(y_train, pipe.predict(x_train)))
print(classification_report(y_test, pipe.predict(x_test)))

              precision    recall  f1-score   support

           0       0.92      0.90      0.91     20000
           1       0.90      0.92      0.91     20000

    accuracy                           0.91     40000
   macro avg       0.91      0.91      0.91     40000
weighted avg       0.91      0.91      0.91     40000

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      5000
           1       0.88      0.90      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [98]:
import eli5
# Show the classifier's (pipe[1]) strongest feature weights, using the fitted
# vectorizer (pipe[0]) to translate feature indices into n-grams.
eli5.show_weights(pipe[1], vec=pipe[0], top=24)

Weight?,Feature
+6.915,excellent
+6.861,great
+5.326,amazing
+5.257,perfect
+4.921,wonderful
+4.374,brilliant
+4.356,hilarious
+4.266,favorite
… 2572 more positive …,… 2572 more positive …
… 2405 more negative …,… 2405 more negative …


In [102]:
# Print the true label of the test review at position 1, then explain the
# pipeline's score for the 'positive' target on that same review.
print('Positive' if y_test.iloc[1] else 'Negative')
eli5.show_prediction(pipe[1], x_test.iloc[1], vec=pipe[0], targets=['positive'], target_names=['negative', 'positive'])

Negative


Contribution?,Feature
-0.002,<BIAS>
-2.525,Highlighted in text (sum)


In [None]:
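# TODO: replace common words with POS tags; no lemmatization or punctuation removal
# TODO: WordPiece tokenization: https://keras.io/api/keras_nlp/tokenizers/compute_word_piece_vocabulary/
# TODO: build transformer model: embedding + sinusoidal positional embedding + multi-head attention + feed-forward network + classification head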

In [103]:
# Add a "text_pos" column: every review passed through preprocessor with
# pos=True (named-entity tokens replaced by their entity label), without
# lemmatization or stopword removal.
# NOTE(review): mutates df in place and invokes the spaCy pipeline once per row.
df["text_pos"] = df["text"].apply(partial(preprocessor, lemma=False, stopw=False, pos=True))

In [107]:
# Persist the frame, including text_pos, to prep_df.csv in the working directory (index not written).
df.to_csv("prep_df.csv", index=False)

In [None]:
# !wget "https://docs.google.com/uc?export=download&id=1Qqu4Z9dtxtO9HezyA-7eVf77myyyo7Ya" -O prep.csv

In [8]:
# Colab-only: mount Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Load the preprocessed frame from Google Drive.
# NOTE(review): the cell that creates prep_df.csv writes it to the working
# directory, not to Drive — presumably it was copied there by hand; confirm
# before relying on Restart & Run All.
prep = pd.read_csv("/content/drive/My Drive/prep_df.csv")
prep.head(1)

Unnamed: 0,text,label,text_pos
0,"Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as ""Teachers"". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is ""Teachers"". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!",1,org org is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my date date in the teaching profession lead me to believe that org org org satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector i m here to sack cardinal of your teachers student welcome to org high i expect that many adults of my age think that org org is far fetched what a pity that it is n t


In [145]:
# Summary statistics of the character length of the preprocessed reviews.
prep["text_pos"].str.len().describe()

count    50000.000000
mean      1261.572580
std        950.496445
min         31.000000
25%        678.000000
50%        937.000000
75%       1533.000000
max      13183.000000
Name: text_pos, dtype: float64

In [10]:
from tensorflow_addons.optimizers import AdamW

In [11]:
# Stratified 80/20 split of the preprocessed text_pos column, same seed as the raw-text split.
# NOTE(review): rebinds x_train/x_test/y_train/y_test, shadowing the earlier raw-text split.
x_train, x_test, y_train, y_test = train_test_split(prep["text_pos"], prep["label"], test_size=0.2, shuffle=True, stratify=prep["label"], random_state=42)

In [25]:
# Vocabulary size cap and fixed per-review token length for the neural model.
max_features = 20000
sequence_length = 512

# Turns each string into sequence_length integer token ids. standardize=None
# skips Keras' own text cleanup — presumably because text_pos was already
# normalized by preprocessor().
vectorize_layer = tf.keras.layers.TextVectorization(
    standardize=None,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

# Build the vocabulary from the training split only.
vectorize_layer.adapt(x_train.values)

In [13]:
def vectorize_text(text, label):
    """Map one (text, label) example to ((token_ids, mask), label).

    The text gets a trailing axis before it is passed to ``vectorize_layer``.
    The mask is True wherever the token id differs from 0 and has an extra
    axis inserted at position -2.
    """
    token_ids = vectorize_layer(tf.expand_dims(text, -1))
    non_padding = tf.cast(tf.not_equal(token_ids, 0), dtype=tf.bool)
    return (token_ids, tf.expand_dims(non_padding, axis=-2)), label

In [14]:
def squeeze(x,z):
    """Remove axis 1 from both tensors of the feature pair ``x``; ``z`` passes through.

    Applied after batching, where each element still carries the singleton
    axis added during per-example vectorization.
    """
    tokens, mask = x[0], x[1]
    squeezed = (tf.squeeze(tokens, axis=1), tf.squeeze(mask, axis=1))
    return squeezed, z

In [26]:
# Examples are vectorized one at a time and cached; squeeze() then removes
# the singleton axis each example carries once they are batched.
# prefetch() is the final stage so that it overlaps the complete upstream
# pipeline (including batch and squeeze) with model execution.
train_ds = (
    tf.data.Dataset.from_tensor_slices((x_train.values, y_train))
    .map(vectorize_text)
    .cache()
    .shuffle(len(x_train))
    .batch(64)
    .map(squeeze)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

test_ds = (
    tf.data.Dataset.from_tensor_slices((x_test.values, y_test))
    .map(vectorize_text)
    .cache()
    .batch(64)
    .map(squeeze)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [16]:
def transformer_encoder(inputs, mask, head_size, num_heads, ff_dim, dropout=0.1):
    """Apply one encoder block to ``inputs`` and return the result.

    The block is self-attention (restricted by ``mask``) followed by a
    two-layer feed-forward network; each half ends with dropout, a residual
    addition and layer normalization. The output keeps the last dimension
    of ``inputs``.
    """
    layers = tf.keras.layers

    # Self-attention half.
    attention = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=head_size, dropout=dropout
    )
    attended = attention(inputs, inputs, attention_mask=mask)
    attended = layers.Dropout(dropout)(attended)
    attended = layers.Add()([attended, inputs])
    attn_out = layers.LayerNormalization(epsilon=1e-6)(attended)

    # Feed-forward half: expand to ff_dim, then project back.
    hidden = layers.Dense(units=ff_dim, activation="gelu")(attn_out)
    hidden = layers.Dropout(dropout)(hidden)
    projected = layers.Dense(units=inputs.shape[-1])(hidden)

    summed = layers.Add()([projected, attn_out])
    return layers.LayerNormalization(epsilon=1e-6)(summed)

In [17]:
class SeqEmbedding(tf.keras.layers.Layer):
    """Token embedding plus fixed sinusoidal positional encoding.

    Token ids are embedded (id 0 is treated as padding via ``mask_zero``),
    scaled by ``sqrt(d_model)`` and summed with a precomputed sine/cosine
    table.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding width; must be even because half of the channels
            hold sines and the other half cosines.
        max_length: Number of positions in the precomputed table. Inputs
            longer than this cannot be encoded.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).

    Raises:
        ValueError: If ``d_model`` is odd.
    """

    def __init__(self, vocab_size, d_model, max_length=1024, **kwargs):
        super().__init__(**kwargs)
        if d_model % 2:
            # An odd width would build a table whose last dimension differs
            # from d_model and only fail later, inside call().
            raise ValueError(f"d_model must be even, got {d_model}")
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.max_length = max_length
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
        self.pos_encoding = self.positional_encoding(length=max_length, depth=d_model)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "vocab_size": self.vocab_size,
            "d_model": self.d_model,
            "max_length": self.max_length,
        })
        return config

    def compute_mask(self, *args, **kwargs):
        """Delegate mask computation to the inner Embedding layer."""
        return self.embedding.compute_mask(*args, **kwargs)

    def positional_encoding(self, length, depth):
        """Build a ``(length, depth)`` float32 table: sines first, then cosines."""
        half = depth // 2

        positions = np.arange(length)[:, np.newaxis]
        depths = np.arange(half)[np.newaxis, :] / half

        angle_rates = 1 / (10000**depths)
        angle_rads = positions * angle_rates

        pos_encoding = np.concatenate(
            [np.sin(angle_rads), np.cos(angle_rads)], axis=-1
        )

        return tf.cast(pos_encoding, dtype=tf.float32)

    def call(self, x):
        """Embed token ids ``x`` and add the encoding of the first ``length`` positions."""
        length = tf.shape(x)[1]
        x = self.embedding(x)

        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[tf.newaxis, :length, :]
        return x

In [18]:
class MeanPooling(tf.keras.layers.Layer):
    """Average the rows of ``inputs`` over axis 1, counting only masked-in positions."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, inputs, masks):
        """Return the mask-weighted mean of ``inputs`` along axis 1.

        Args:
            inputs: Tensor to pool; axis 1 is reduced.
            masks: Mask with a singleton axis 1 that is squeezed away here;
                non-zero / True entries mark positions to include.
        """
        masks = tf.squeeze(masks, axis=1)
        # A trailing axis lets the weights broadcast over the feature axis.
        weights = tf.cast(masks[..., None], dtype=tf.float32)
        sum_embeddings = tf.reduce_sum(inputs * weights, axis=1)
        sum_masks = tf.reduce_sum(weights, axis=1)
        # Lower-bound the count to avoid dividing by zero. tf.maximum states
        # that directly; the former clip_by_value call used the data's own
        # maximum as upper bound, which inverts the bounds when all counts are 0.
        sum_masks = tf.maximum(sum_masks, 1e-9)
        mean_embeddings = sum_embeddings / sum_masks
        return mean_embeddings

In [38]:
def build_model(
                input_shape,
                num_heads=4,
                num_transformer_blocks=4,
                mlp_units=(64,),
                n_classes=1,
                emb_dim=128,
                dropout=0.1,
                mlp_dropout=0.1,
):
    """Assemble the Transformer-encoder classifier as a Keras functional model.

    Pipeline: SeqEmbedding -> LayerNormalization -> ``num_transformer_blocks``
    encoder blocks -> masked mean pooling -> MLP head -> linear output.

    Args:
        input_shape: Shape of one token-id example, e.g. ``(sequence_length,)``.
        num_heads: Attention heads per block; each head has size
            ``emb_dim // num_heads``.
        num_transformer_blocks: Number of stacked encoder blocks.
        mlp_units: Widths of the ReLU layers in the classification head.
        n_classes: Units of the final Dense layer, which has no activation
            (the model returns logits).
        emb_dim: Embedding width; the feed-forward width is ``4 * emb_dim``.
        dropout: Dropout rate inside the encoder blocks.
        mlp_dropout: Dropout rate after each head layer.

    Returns:
        ``tf.keras.Model`` named "Transformer" taking ``[input, mask]``.
    """
    head_size = emb_dim // num_heads
    ff_dim = emb_dim * 4

    inputs = tf.keras.Input(shape=input_shape, name="input")
    # The mask width follows input_shape rather than the notebook-global
    # sequence_length, so the two inputs cannot get out of sync.
    mask = tf.keras.Input(shape=(1, input_shape[-1]), name="mask")

    x = inputs
    x = SeqEmbedding(max_features+1, emb_dim)(x)

    x = tf.keras.layers.LayerNormalization(epsilon=1e-6)(x)
    for _ in range(num_transformer_blocks):
        x = transformer_encoder(x, mask, head_size, num_heads, ff_dim, dropout)

    x = MeanPooling()(x, mask)

    for dim in mlp_units:
        x = tf.keras.layers.Dense(dim, activation="relu")(x)
        x = tf.keras.layers.Dropout(mlp_dropout)(x)

    outputs = tf.keras.layers.Dense(n_classes)(x)
    return tf.keras.Model([inputs, mask], outputs, name="Transformer")

In [39]:
# Two encoder blocks over sequence_length-token inputs, with 0.2 dropout in the MLP head.
model = build_model(input_shape=(sequence_length,), mlp_dropout=0.2, num_transformer_blocks=2)

In [40]:
# The model emits logits (loss uses from_logits=True), so the accuracy metric
# must cut at logit 0.0, i.e. probability 0.5. A threshold of 0.5 on raw
# logits would correspond to a probability of about 0.62.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=AdamW(learning_rate=0.001, weight_decay=0.001),
              metrics=[tf.metrics.BinaryAccuracy(threshold=0.0)])

callbacks = [
             tf.keras.callbacks.ReduceLROnPlateau(patience=2),
             tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
            ]

epochs = 10
# NOTE(review): test_ds doubles as validation data here, so early stopping and
# learning-rate scheduling are tuned on the test set; scores reported on it
# afterwards are optimistic. A separate validation split would avoid that.
history = model.fit(train_ds, validation_data=test_ds, epochs=epochs, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [41]:
# Raw model outputs on the test set; the final Dense layer has no activation, so these are logits.
y_pred = model.predict(test_ds)



In [42]:
# Convert logits to probabilities.
# NOTE(review): this overwrites y_pred, so re-running the cell applies the sigmoid a second time.
y_pred = tf.nn.sigmoid(y_pred)

In [35]:
from sklearn.metrics import precision_recall_curve, accuracy_score

In [36]:
def get_best_threshold(y_true, y_score):
    """Find the score cut-off that maximizes accuracy.

    Every distinct score is tried as a threshold ``t``, with a sample
    predicted positive when ``score >= t``. All accuracies come from one sort
    and one cumulative sum instead of one full pass per threshold.

    Args:
        y_true: Binary labels (0/1 or bool), any array-like; flattened.
        y_score: Scores of the positive class, same number of elements;
            column-shaped input such as ``(n, 1)`` is flattened.

    Returns:
        Tuple ``(threshold, accuracy)``; on ties the lowest threshold wins.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    labels = np.asarray(y_true).ravel().astype(bool)
    scores = np.asarray(y_score).ravel()
    if labels.size == 0 or labels.size != scores.size:
        raise ValueError("y_true and y_score must be non-empty and of equal length")

    order = np.argsort(scores, kind="stable")
    sorted_scores = scores[order]
    sorted_labels = labels[order]

    thresholds = np.unique(sorted_scores)  # ascending, distinct
    # below[i] = number of samples scoring strictly less than thresholds[i],
    # i.e. the samples predicted negative at that threshold.
    below = np.searchsorted(sorted_scores, thresholds, side="left")
    # positives_upto[k] = positives among the k lowest-scoring samples.
    positives_upto = np.concatenate(([0], np.cumsum(sorted_labels)))

    true_neg = below - positives_upto[below]
    true_pos = positives_upto[-1] - positives_upto[below]
    accs = (true_pos + true_neg) / labels.size

    ix = np.argmax(accs)
    return thresholds[ix], float(accs[ix])

In [43]:
# Accuracy-maximizing cut-off for the predicted probabilities.
# NOTE(review): the threshold is selected on the test labels themselves, so the accuracy shown is optimistic.
get_best_threshold(y_test, y_pred)

(0.64915395, 0.8962)

In [44]:
# TF-IDF + logistic regression baseline for comparison with the Transformer.
# NOTE(review): assuming cells ran in displayed order, x_train/x_test now hold
# the text_pos column (rebound by the later split), so preprocessor runs on
# already-preprocessed text here.
pipe = make_pipeline(TfidfVectorizer(ngram_range=(1, 2), max_features=5000, preprocessor=partial(preprocessor, lemma=False, stopw=True)),
                     LogisticRegression(random_state=42, n_jobs=-1)).fit(x_train, y_train)

# Report on the training split first, then on the held-out split.
print(classification_report(y_train, pipe.predict(x_train)))
print(classification_report(y_test, pipe.predict(x_test)))

              precision    recall  f1-score   support

           0       0.92      0.90      0.91     20000
           1       0.90      0.92      0.91     20000

    accuracy                           0.91     40000
   macro avg       0.91      0.91      0.91     40000
weighted avg       0.91      0.91      0.91     40000

              precision    recall  f1-score   support

           0       0.90      0.87      0.89      5000
           1       0.88      0.90      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

