## Quora Insincere Questions Classification

### Description

##### We will be predicting whether a question asked on Quora is sincere or not.

An insincere question is defined as a question intended to make a statement rather than look for helpful answers.

In [18]:
# This cell runs before the notebook's other import cells, so import TensorFlow here;
# without it the cell raises NameError on a fresh kernel.
import tensorflow

gpus = tensorflow.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only use the first GPU
        tensorflow.config.experimental.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tensorflow.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tensorflow.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

In [19]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D, GRU
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import sequential
from keras import initializers, regularizers, constraints, optimizers, layers

In [20]:
# Read the labelled training questions and the unlabelled test questions from disk.
train_df = pd.read_csv(
    'C:\\Users\\maryam\\Desktop\\Python Program\\Deep Learning\\2\\train.csv'
)
test_df = pd.read_csv(
    'C:\\Users\\maryam\\Desktop\\Python Program\\Deep Learning\\2\\test.csv'
)

# Report (rows, columns) for each frame.
for split_name, frame in (("Train", train_df), ("Test", test_df)):
    print("{} shape : ".format(split_name), frame.shape)

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


Next steps are as follows:

1) Split the training dataset into train and validation samples. Cross-validation is a time-consuming process, so let us do a simple train/val split.

2) Fill up the missing values in the text column with 'na'

3) Tokenize the text column and convert them to vector sequences

4) Pad the sequences as needed - if the number of words in the text is greater than 'maxlen', truncate it to 'maxlen'; if the number of words in the text is less than 'maxlen', add zeros for the remaining values.

##### split to train and val

In [21]:
# Hold out 10% of the labelled rows as a validation set; the fixed random_state keeps the split reproducible.
# NOTE: this rebinds `train_df`, so re-running this cell without reloading the CSV splits the
# already-reduced training frame again.
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=2018)

some config values

In [22]:
embed_size = 300 # dimensionality of each word vector
max_features = 50000 # vocabulary cap: passed to the Tokenizer as num_words and used as the Embedding layer's row count
maxlen = 100 # number of tokens every question is padded/truncated to

#### fill up the missing values

In [23]:

# Pull the question text out of each split as a NumPy array,
# substituting the placeholder "_na_" for any missing question.
train_X, val_X, test_X = (
    frame["question_text"].fillna("_na_").values
    for frame in (train_df, val_df, test_df)
)


In [24]:
# Peek at the first two rows of the training split
train_df.head(2)

Unnamed: 0,qid,question_text,target
651064,7f8590ef60e30b4344fd,What have been the best exhibits at the Museo ...,0
1294259,fda9538a2e0a5b2dfc3c,How can I rotate batch image files?,0


In [25]:
# Display the raw question strings (NumPy abbreviates the repr of this large array)
train_X

array(['What have been the best exhibits at the Museo del Prado in Madrid?',
       'How can I rotate batch image files?',
       'Which is the best cable operator in Thane west area?', ...,
       'Do we need a prescription for cough syrup in Egypt?',
       'What are the best and worst aspects of being a travel agent?',
       'Who was a person you met who gave a very good vibe/ good-spirit that you remained friends with through life?'],
      dtype=object)

#### Tokenize the sentences

In [26]:

# Fit the vocabulary on the training questions only (capped via num_words=max_features).
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))

# Convert every split to integer word-index sequences using that same fitted tokenizer.
# NOTE: this rebinds train_X / val_X / test_X from raw strings to sequences.
train_X, val_X, test_X = (
    tokenizer.texts_to_sequences(texts) for texts in (train_X, val_X, test_X)
)

#### Pad the sentences 

In [27]:
def _pad_to_maxlen(sequences):
    """Pad or truncate every sequence to exactly `maxlen` entries (pad_sequences defaults)."""
    return pad_sequences(sequences, maxlen=maxlen)


# Each split is replaced by its fixed-width integer matrix, one split at a time.
train_X = _pad_to_maxlen(train_X)
val_X = _pad_to_maxlen(val_X)
test_X = _pad_to_maxlen(test_X)


In [28]:
# Sanity check: one row per training question, `maxlen` columns
train_X.shape

(1175509, 100)

### Get the target values

In [29]:
# Labels from the `target` column, in the same row order as train_X / val_X
train_y = train_df['target'].values
val_y = val_df['target'].values

### Without Pretrained Embeddings:



Now that we are done with all the necessary preprocessing steps, we can first train a Bidirectional GRU model. 
We will not use any pre-trained word embeddings for this model and the embeddings will be learnt from scratch. 
Please check out the model summary for the details of the layers used.

In [30]:
import tensorflow
from tensorflow.core.protobuf import rewriter_config_pb2
from keras.backend import set_session

# Build a TF1-compat session whose graph rewriter has arithmetic optimization switched off,
# then register it as the session used by the tf.keras backend.
# NOTE(review): the reason for disabling this optimization is not recorded in the notebook —
# confirm it is still required for this environment.
config_proto = tensorflow.compat.v1.ConfigProto()
off = rewriter_config_pb2.RewriterConfig.OFF
config_proto.graph_options.rewrite_options.arithmetic_optimization = off
session = tensorflow.compat.v1.Session(config=config_proto)
tensorflow.compat.v1.keras.backend.set_session(session)

In [31]:
# Show which TensorFlow installation the kernel has loaded
print(tensorflow)

<module 'tensorflow' from 'C:\\Users\\maryam\\Anaconda3\\envs\\tf_gpu\\lib\\site-packages\\tensorflow\\__init__.py'>


In [32]:
# Baseline: embeddings learnt from scratch -> bidirectional GRU -> max-pool over time -> small dense head.
inp = Input(shape=(maxlen,))

x = inp
for layer in (
    Embedding(max_features, embed_size),
    Bidirectional(GRU(64, return_sequences=True)),
    GlobalMaxPool1D(),
    Dense(16, activation="relu"),
    Dropout(0.1),
    Dense(1, activation="sigmoid"),  # single sigmoid unit for the binary target
):
    x = layer(x)

model = Model(inputs=inp, outputs=x)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

print(model.summary())

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_4 (Bidirection (None, 100, 128)          140160    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 17  

Train the model using train sample and monitor the metric on the valid sample. This is just a sample model running for 2 epochs. 

#### Train the model 


In [33]:
# Train for 2 epochs in batches of 512, evaluating on the held-out validation split
model.fit(train_X, train_y, batch_size=512, epochs=2, validation_data=(val_X, val_y))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x10e02bf1e88>

Now let us get the validation sample predictions and also get the best threshold for F1 score.

In [34]:
# Score the validation split once, then sweep decision thresholds from 0.10 to 0.50 in steps of 0.01.
pred_noemb_val_y = model.predict([val_X], batch_size=1024, verbose=1)

# Track the best threshold so it is reported explicitly instead of being read off the table by eye.
best_thresh, best_f1 = None, -1.0
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)  # remove floating-point noise from the arange steps
    f1 = metrics.f1_score(val_y, (pred_noemb_val_y>thresh).astype(int))
    print("F1 score at threshold {0} is {1}".format(thresh, f1))
    if f1 > best_f1:
        best_thresh, best_f1 = thresh, f1

print("Best F1 score {0} at threshold {1}".format(best_f1, best_thresh))

F1 score at threshold 0.1 is 0.548190504852908
F1 score at threshold 0.11 is 0.5582149474984558
F1 score at threshold 0.12 is 0.5674554576699719
F1 score at threshold 0.13 is 0.5762164344841512
F1 score at threshold 0.14 is 0.583508295253386
F1 score at threshold 0.15 is 0.5900125891733109
F1 score at threshold 0.16 is 0.5964192624877152
F1 score at threshold 0.17 is 0.601500498720673
F1 score at threshold 0.18 is 0.607649599012955
F1 score at threshold 0.19 is 0.6116604769788357
F1 score at threshold 0.2 is 0.6158245948522403
F1 score at threshold 0.21 is 0.6205213226489822
F1 score at threshold 0.22 is 0.6249591789129928
F1 score at threshold 0.23 is 0.6284173682716867
F1 score at threshold 0.24 is 0.6322958131504484
F1 score at threshold 0.25 is 0.6352713178294574
F1 score at threshold 0.26 is 0.6373303278286861
F1 score at threshold 0.27 is 0.6407102469993056
F1 score at threshold 0.28 is 0.6427890644594934
F1 score at threshold 0.29 is 0.6449620253164556
F1 score at threshold 0.3 

Now let us get the test set predictions as well and save them

In [35]:
# Sigmoid outputs of the no-pretrained-embedding model for the test questions
pred_noemb_test_y = model.predict([test_X], batch_size=1024, verbose=1)



Now that our model building is done, it might be a good idea to clean up some memory before we go to the next step.

In [36]:
import gc

# Drop the references to the baseline model and its graph tensors,
# force a garbage-collection pass, then pause for ten seconds.
del model
del inp
del x
gc.collect()
time.sleep(10)

So we got some baseline GRU model without pre-trained embeddings. Now let us use the provided embeddings and rebuild the model again to see the performance.

#### Glove Embeddings:

In this section, let us use the Glove embeddings and rebuild the GRU model.

#### Embedding

Embedding setup

In [37]:
# Load the GloVe vectors into a dict mapping token -> float32 vector.
# Each line of the file is "<token> <v1> <v2> ..." separated by single spaces.
embeddings_index = {}
# `with` guarantees the file is closed even if a line fails to parse.
with open('C:\\Users\\maryam\\Desktop\\Python Program\\Deep Learning\\2\\glove.840B.300d\\glove.840B.300d.txt',encoding="utf-8") as f:
    for line in tqdm(f):
        values = line.split(" ") # Return a list of the words in the string
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        # A token that appears on more than one line keeps the vector from its last occurrence.
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

2196017it [16:03, 2278.35it/s]


Found 2196016 word vectors.


#### Convert values to Embeddings

In [38]:
def text_to_array(text, max_words=30, embed_dim=300, embeddings=None):
    """Convert one question into a fixed-size matrix of word vectors.

    The text is split on whitespace, truncated to ``max_words`` tokens, and each
    token is looked up in ``embeddings``. Unknown tokens and padding positions
    (after the last token) are all-zero vectors.

    Parameters
    ----------
    text : str
        The question text. Non-string values (e.g. NaN from a missing cell)
        are treated as an empty question.
    max_words : int, default 30
        Number of rows in the returned matrix.
    embed_dim : int, default 300
        Length of each word vector; must match the vectors in ``embeddings``.
    embeddings : mapping of str -> array, optional
        Token-to-vector lookup. Defaults to the notebook-level ``embeddings_index``.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(max_words, embed_dim)``.
    """
    if embeddings is None:
        embeddings = embeddings_index
    if not isinstance(text, str):
        text = ""

    # Drop a single trailing question mark (presumably the intent of the original
    # `text[:-1]`), but leave the text alone otherwise so that the final word of a
    # question without "?" is not cut short and missed in the lookup.
    if text.endswith("?"):
        text = text[:-1]

    tokens = text.split()[:max_words]
    # float32 matches the dtype of the loaded vectors, so the result dtype is
    # consistent regardless of how many tokens were found.
    empty_emb = np.zeros(embed_dim, dtype="float32")
    embeds = [embeddings.get(token, empty_emb) for token in tokens]
    embeds += [empty_emb] * (max_words - len(embeds))
    return np.array(embeds, dtype="float32")

# train_vects = [text_to_array(X_text) for X_text in tqdm(train_df["question_text"])]
# Embed only the first 3000 validation rows (by position) for monitoring the GloVe model.
val_head = val_df.iloc[:3000]
val_vects = np.array(
    [text_to_array(question) for question in tqdm(val_head["question_text"])]
)
# NOTE: this rebinds `val_y` to the labels of those 3000 rows only.
val_y = np.array(val_head["target"])

100%|██████████████████████████████████████████████████████████████████████████████| 3000/3000 [04:44<00:00, 10.56it/s]


#### Data Providers

In [39]:
batch_size = 128

def batch_gen(train_df, batch_size=batch_size):
    """Endlessly yield shuffled ``(text_arr, targets)`` training batches.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame with a ``question_text`` column and a ``target`` column.
    batch_size : int
        Rows per batch. The default is captured when this function is defined,
        so rebinding the notebook-level ``batch_size`` later does not silently
        change the training batch size.

    Yields
    ------
    tuple of (numpy.ndarray, numpy.ndarray)
        The stacked ``text_to_array`` output for each question in the batch,
        and the matching ``target`` values. The last batch of each pass may
        hold fewer than ``batch_size`` rows.

    Raises
    ------
    ValueError
        If ``train_df`` is empty (the loop below would otherwise spin forever
        without yielding anything).
    """
    n_batches = math.ceil(len(train_df) / batch_size) # Return the ceiling of x as an Integral
    if n_batches == 0:
        raise ValueError("batch_gen needs a non-empty DataFrame")
    while True:
        train_df = train_df.sample(frac=1.)  # Shuffle the data.
        for i in range(n_batches):
            batch = train_df.iloc[i*batch_size:(i+1)*batch_size]
            # Select columns by name rather than by position so a change in column order cannot
            # feed the wrong column to the model.
            text_arr = np.array([text_to_array(text) for text in batch["question_text"]])
            yield text_arr, np.array(batch["target"])

#### Training

In [42]:
from keras.models import Sequential

In [43]:
# Two stacked bidirectional GRUs over the (30 words x 300 dims) embedded question,
# followed by a single sigmoid unit for the binary target.
model = Sequential([
    Bidirectional(GRU(64, return_sequences=True), input_shape=(30, 300)),
    Bidirectional(GRU(64)),
    Dense(1, activation="sigmoid"),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [44]:
# Stream shuffled training batches from the generator. Each "epoch" here is 1000 generator
# batches rather than a full pass over the training frame.
mg = batch_gen(train_df)
model.fit_generator(
    mg,
    steps_per_epoch=1000,
    epochs=20,
    validation_data=(val_vects, val_y),
    verbose=True,
)

Epoch 1/20




Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x10f8134ecc8>

#### Prediction

In [45]:
batch_size = 256

# NOTE: this definition reuses the name `batch_gen` and therefore replaces the training
# generator defined earlier in the notebook; re-run the training cell's definition before
# training again.
def batch_gen(test_df, batch_size=batch_size):
    """Yield embedded question batches from ``test_df`` once, in row order.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Frame with a ``question_text`` column.
    batch_size : int
        Rows per batch; the default is captured when this function is defined.

    Yields
    ------
    numpy.ndarray
        The stacked ``text_to_array`` output for each question in the batch.
        The final batch may hold fewer than ``batch_size`` rows.
    """
    n_batches = math.ceil(len(test_df) / batch_size)
    for i in range(n_batches):
        # Select the text column by name rather than by position.
        texts = test_df["question_text"].iloc[i*batch_size:(i+1)*batch_size]
        text_arr = np.array([text_to_array(text) for text in texts])
        yield text_arr


all_preds = []
# Passing `total` lets the progress bar show completion, since a generator has no length.
for x in tqdm(batch_gen(test_df), total=math.ceil(len(test_df) / batch_size)):
    all_preds.extend(model.predict(x).flatten())

1468it [15:54,  1.54it/s]


#### Observation

Pretrained embeddings seem to give better results compared to the non-pretrained model.