In [1]:
import warnings
import numpy as np
import pandas as pd
from tqdm import tqdm
from string import punctuation
from collections import Counter

from typing import List, Dict, Callable

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import gensim
import gensim.downloader as api

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Concatenate, Average

warnings.filterwarnings('ignore')

# Data preparation

In [2]:
data = pd.read_csv('quora.csv')
data

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0
...,...,...,...
1306117,ffffcc4e2331aaf1e41e,What other technical skills do you need as a c...,0
1306118,ffffd431801e5a2f4861,Does MS in ECE have good job prospects in USA ...,0
1306119,ffffd48fb36b63db010c,Is foam insulation toxic?,0
1306120,ffffec519fa37cf60c78,How can one start a research project based on ...,0


In [3]:
def preprocess(text: str) -> List[str]:
    tokens = text.lower().split()
    tokens = [token.strip(punctuation) for token in tokens]
    return [token for token in tokens if token]

In [4]:
preprocessed = [preprocess(text) for text in tqdm(data.question_text)]

100%|██████████| 1306122/1306122 [00:16<00:00, 80362.87it/s] 


In [5]:
MAX_LEN = max(len(tokens) for tokens in preprocessed)
MAX_LEN

132

In [6]:
vocab = Counter()

for tokens in preprocessed:
    vocab.update(tokens)

len(vocab)

273055

In [7]:
filtered_vocab = {word for word, count in vocab.items() if count >= 50}
len(filtered_vocab)

14256

In [8]:
word2id = {'<PAD>': 0, '<UNK>': 1}

for word in filtered_vocab:
    word2id[word] = len(word2id)

In [9]:
X = []

for tokens in preprocessed:
    ids = [word2id.get(token, 1) for token in tokens]
    X.append(ids)

In [10]:
X = pad_sequences(X, maxlen=MAX_LEN)
X.shape

(1306122, 132)

In [11]:
y = data.target.values
y.shape

(1306122,)

In [12]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.05, stratify=y, random_state=42)

In [13]:
ft_matrix = np.zeros((len(word2id), 300))
ft = api.load('fasttext-wiki-news-subwords-300')

glove_matrix = np.zeros((len(word2id), 300))
glove = api.load('glove-wiki-gigaword-300')

ft_unk_indices = []
glove_unk_indices = []

for word, idx in tqdm(word2id.items()):
    if word == '<PAD>':
        continue
    
    if word == '<UNK>':
        ft_unk_indices.append(idx)
        glove_unk_indices.append(idx)
        
    try:
        ft_matrix[idx] = ft[word]
    except KeyError:
        ft_unk_indices.append(idx)
    
    try:
        glove_matrix[idx] = glove[word]
    except KeyError:
        glove_unk_indices.append(idx)

ft_mean_vector = np.mean(np.vstack([vector for i, vector in enumerate(ft_matrix) if i not in ft_unk_indices]), axis=0)
for idx in ft_unk_indices:
    ft_matrix[idx] = ft_mean_vector
        
glove_mean_vector = np.mean(np.vstack([vector for i, vector in enumerate(glove_matrix) if i not in glove_unk_indices]), axis=0)
for idx in glove_unk_indices:
    glove_matrix[idx] = glove_mean_vector



100%|██████████| 14258/14258 [00:00<00:00, 38539.19it/s]


# Sentence embedding experiments

In [14]:
def average(lst):
    return Average()(lst)

In [15]:
def concatenate(lst):
    return Concatenate()(lst)

In [16]:
def build_model(embedding_dim: int = 100, pooling_fn: Callable = average,
                hidden: int = 64, dropout_rate: float = 0.1, l2_rate: float = 1e-4,
                output_dim: int = 1, lr: float = 1e-3):
    inputs = Input(shape=(MAX_LEN,))
    
    embeddings1 = Embedding(input_dim=len(word2id), output_dim=embedding_dim, weights=[ft_matrix], trainable=False)(inputs)
    embeddings2 = Embedding(input_dim=len(word2id), output_dim=embedding_dim, weights=[glove_matrix], trainable=False)(inputs)
    
    embeddings = pooling_fn([embeddings1, embeddings2])
    pool = tf.math.reduce_mean(embeddings, axis=1)
    pool = Dropout(dropout_rate)(pool)
    
    dense = Dense(hidden, activation='relu', kernel_regularizer=regularizers.l2(l2_rate))(pool)
    dense = Dropout(dropout_rate)(dense)
    
    activation = 'sigmoid' if output_dim == 1 else 'softmax'
    outputs = Dense(output_dim, activation=activation, kernel_regularizer=regularizers.l2(l2_rate))(dense)

    model = Model(inputs=inputs, outputs=outputs)
    optimizer = Adam(learning_rate=lr)
    loss = 'binary_crossentropy' if output_dim == 1 else 'categorical_crossentropy'
    
    model.compile(optimizer=optimizer,
                  loss=loss,
                  metrics=['accuracy'])
    
    return model

In [17]:
callbacks = [EarlyStopping(patience=2)]

## 1. Concatenation

In [18]:
model = build_model(embedding_dim=300, pooling_fn=concatenate, hidden=256, lr=0.005, dropout_rate=0.2)

model.fit(X_train, y_train,
          validation_data=(X_valid, y_valid),
          batch_size=8192,
          callbacks=callbacks,
          epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50


<tensorflow.python.keras.callbacks.History at 0x7f6a885fc4e0>

In [19]:
preds = model.predict(X_valid).reshape(-1)
print(classification_report(y_valid, (preds > 0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97     61266
           1       0.71      0.20      0.31      4041

    accuracy                           0.95     65307
   macro avg       0.83      0.60      0.64     65307
weighted avg       0.93      0.95      0.93     65307



## 2. Averaging

In [20]:
model = build_model(embedding_dim=300, hidden=256, lr=0.005, dropout_rate=0.2)

model.fit(X_train, y_train,
          validation_data=(X_valid, y_valid),
          batch_size=8192,
          callbacks=callbacks,
          epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50


<tensorflow.python.keras.callbacks.History at 0x7f6a80581320>

In [21]:
preds = model.predict(X_valid).reshape(-1)
print(classification_report(y_valid, (preds > 0.5).astype(int)))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97     61266
           1       0.71      0.16      0.26      4041

    accuracy                           0.94     65307
   macro avg       0.83      0.58      0.61     65307
weighted avg       0.93      0.94      0.93     65307



The concatenation of embeddings works better which is in agreement with our expectations: after all, different word vectors will have different dimensions of meaning, therefore, averaging them might not be a good idea.