In [1]:
import warnings
import numpy as np
import pandas as pd
from tqdm import tqdm
from string import punctuation
from collections import Counter

from typing import List, Dict, Callable

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Flatten

warnings.filterwarnings('ignore')

# Data preparation

In [2]:
data = pd.read_csv('quora.csv')
data

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0
...,...,...,...
1306117,ffffcc4e2331aaf1e41e,What other technical skills do you need as a c...,0
1306118,ffffd431801e5a2f4861,Does MS in ECE have good job prospects in USA ...,0
1306119,ffffd48fb36b63db010c,Is foam insulation toxic?,0
1306120,ffffec519fa37cf60c78,How can one start a research project based on ...,0


In [3]:
def preprocess(text: str) -> List[str]:
    tokens = text.lower().split()
    tokens = [token.strip(punctuation) for token in tokens]
    return [token for token in tokens if token]

In [4]:
preprocessed = [preprocess(text) for text in tqdm(data.question_text)]

100%|██████████| 1306122/1306122 [00:10<00:00, 124857.25it/s]


In [5]:
MAX_LEN = max(len(tokens) for tokens in preprocessed)
MAX_LEN

132

In [6]:
vocab = Counter()

for tokens in preprocessed:
    vocab.update(tokens)

len(vocab)

273055

In [7]:
filtered_vocab = {word for word, count in vocab.items() if count >= 50}
len(filtered_vocab)

14256

In [8]:
word2id = {'<PAD>': 0, '<UNK>': 1}

for word in filtered_vocab:
    word2id[word] = len(word2id)

In [9]:
X = []
filtered = []

for tokens in preprocessed:
    ids = [word2id.get(token, 1) for token in tokens]
    filtered.append([token if token in filtered_vocab else '<UNK>' for token in tokens])
    X.append(ids)

In [38]:
dummy = lambda x: x
tfidf = TfidfVectorizer(preprocessor=dummy, tokenizer=dummy, lowercase=False, vocabulary=word2id)
tfidf.fit(filtered)

tfidf_dict = {token: tfidf.idf_[idx] for token, idx in word2id.items()}
X_tfidf = [np.array([tfidf_dict[token] for token in tokens]) for tokens in tqdm(filtered)]
X_tfidf = np.expand_dims(pad_sequences(X_tfidf, maxlen=MAX_LEN), axis=-1)
X_tfidf.shape

100%|██████████| 1306122/1306122 [00:10<00:00, 130011.26it/s][A


(1306122, 132, 1)

In [11]:
X = pad_sequences(X, maxlen=MAX_LEN)
X.shape

(1306122, 132)

In [12]:
y = data.target.values
y.shape

(1306122,)

In [39]:
X_train, X_valid, \
X_tfidf_train, X_tfidf_valid, \
y_train, y_valid = train_test_split(X, X_tfidf, y, test_size=0.05, stratify=y, random_state=420)

# Sentence embedding experiments

In [14]:
def concatenate(embeddings, weights = None):
    return Flatten()(embeddings)

In [15]:
def average(embeddings, weights = None):
    return tf.math.reduce_mean(embeddings, axis=1)

In [16]:
def add(embeddings, weights = None):
    return tf.math.reduce_sum(embeddings, axis=1)

In [17]:
def weighted_average(embeddings, weights):
    return tf.squeeze(tf.matmul(embeddings, weights, transpose_a=True), axis=-1)

In [35]:
def build_model(embedding_dim: int = 100, pooling_fn: Callable = weighted_average,
                hidden: int = 64, dropout_rate: float = 0.1, l2_rate: float = 1e-4,
                output_dim: int = 1, lr: float = 1e-3):
    inputs = Input(shape=(MAX_LEN,))
    weights = Input(shape=(MAX_LEN, 1))
    
    embeddings = Embedding(input_dim=len(word2id), output_dim=embedding_dim)(inputs)
    embeddings = Dropout(dropout_rate)(embeddings)
    pool = pooling_fn(embeddings, weights)
    
    dense = Dense(hidden, activation='relu', kernel_regularizer=regularizers.l2(l2_rate))(pool)
    dense = Dropout(dropout_rate)(dense)
    
    activation = 'sigmoid' if output_dim == 1 else 'softmax'
    outputs = Dense(output_dim, activation=activation, kernel_regularizer=regularizers.l2(l2_rate))(dense)

    model = Model(inputs=[inputs, weights], outputs=outputs)
    optimizer = Adam(learning_rate=lr)
    loss = 'binary_crossentropy' if output_dim == 1 else 'categorical_crossentropy'
    
    model.compile(optimizer=optimizer,
                  loss=loss,
                  metrics=['accuracy'])
    
    return model

In [19]:
callbacks = [EarlyStopping(patience=2)]

## 1. Concatenation

In [20]:
model = build_model(embedding_dim=100, pooling_fn=concatenate, hidden=128, lr=0.0025)

model.fit([X_train, X_tfidf_train], y_train, 
          validation_data=([X_valid, X_tfidf_valid], y_valid),
          batch_size=4096,
          callbacks=callbacks,
          epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50


<tensorflow.python.keras.callbacks.History at 0x7f33474b37b8>

In [21]:
preds = model.predict([X_valid, X_tfidf_valid]).reshape(-1)
print(classification_report(y_valid, (preds > 0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97     61266
           1       0.65      0.52      0.58      4041

    accuracy                           0.95     65307
   macro avg       0.81      0.75      0.78     65307
weighted avg       0.95      0.95      0.95     65307



## 2. Averaging

In [22]:
model = build_model(embedding_dim=100, pooling_fn=average, hidden=64, lr=0.01)

model.fit([X_train, X_tfidf_train], y_train,
          validation_data=([X_valid, X_tfidf_valid], y_valid),
          batch_size=16384,
          callbacks=callbacks,
          epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50


<tensorflow.python.keras.callbacks.History at 0x7f32782f80f0>

In [23]:
preds = model.predict([X_valid, X_tfidf_valid]).reshape(-1)
print(classification_report(y_valid, (preds > 0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98     61266
           1       0.68      0.52      0.59      4041

    accuracy                           0.96     65307
   macro avg       0.82      0.75      0.78     65307
weighted avg       0.95      0.96      0.95     65307



## 3. Summation

In [24]:
model = build_model(embedding_dim=100, pooling_fn=add, hidden=64, lr=0.01)

model.fit([X_train, X_tfidf_train], y_train,
          validation_data=([X_valid, X_tfidf_valid], y_valid),
          batch_size=16384,
          callbacks=callbacks,
          epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


<tensorflow.python.keras.callbacks.History at 0x7f327817df28>

In [25]:
preds = model.predict([X_valid, X_tfidf_valid]).reshape(-1)
print(classification_report(y_valid, (preds > 0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98     61266
           1       0.68      0.51      0.58      4041

    accuracy                           0.95     65307
   macro avg       0.82      0.75      0.78     65307
weighted avg       0.95      0.95      0.95     65307



## 4. Extra: tf-idf averaging

In [42]:
model = build_model(embedding_dim=100, hidden=64, dropout_rate=0.2, lr=0.01)

model.fit([X_train, X_tfidf_train], y_train,
          validation_data=([X_valid, X_tfidf_valid], y_valid),
          batch_size=16384,
          callbacks=callbacks,
          epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


<tensorflow.python.keras.callbacks.History at 0x7f321c2ba6d8>

In [43]:
preds = model.predict([X_valid, X_tfidf_valid]).reshape(-1)
print(classification_report(y_valid, (preds > 0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98     61266
           1       0.64      0.57      0.60      4041

    accuracy                           0.95     65307
   macro avg       0.80      0.77      0.79     65307
weighted avg       0.95      0.95      0.95     65307



Using tf-idf averaging allows us to gain an extra point in the f1-score with macro averaging, which is often the most suitable metric for an unbalanced classification task. However, the gain is not significant enough compared to normal averaging. Moreover, we lose one point in accuracy. Overall, there is no need to do the extra work here since unweighted averaging produces a result that is almost identical to this one.