In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
   
import tensorflow as tf
from tensorflow import keras

import numpy as np
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



import os
print(tf.__version__)
 

2.0.0


# Loading and preprocessing the data:

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! ls "/content/drive/My Drive/quora_data"

glove.840B.300d			paragram_300_sl999     test_data.txt
GoogleNews-vectors-negative300	sample_submission.csv  train.csv
insincere.txt			sincere.txt	       wiki-news-300d-1M
my_submission.csv		test.csv


In [None]:
import pandas as pd 
# Load the training and test CSV files from the mounted Google Drive folder.
train_data = pd.read_csv('/content/drive/My Drive/quora_data/train.csv')
test_data = pd.read_csv('/content/drive/My Drive/quora_data/test.csv')

In [None]:
# Punctuation marks and special symbols that `_clean_puncts` surrounds with
# spaces, so each one ends up as its own whitespace-delimited token.
puncts = [
    ',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&',
    '/', '[', ']', '%', '=', '#', '*', '+', '\\', '•', '~', '@', '£',
    '·', '_', '{', '}', '©', '^', '®', '`', '→', '°', '€', '™', '›',
    '♥', '←', '×', '§', '″', '′', 'Â', '█', 'à', '…', '“', '★', '”',
    '–', '●', 'â', '►', '−', '¢', '¬', '░', '¶', '↑', '±',  '▾',
    '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '⊕', '▼',
    '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
    'è', '¸', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»',
    '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø',
    '¹', '≤', '‡', '₹', '´'
]

In [None]:
# Maps contractions and run-together words to their expanded form; consumed
# by `clean_abbreviation`, which substitutes each key found in the text with
# its value.
# NOTE(review): `clean` lower-cases the text before expanding abbreviations,
# so keys that contain capitals (e.g. "I'd", 'doI', 'theBest') cannot match
# on that path.
abbreviations = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "this's": "this is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "here's": "here is",
    "they'd": "they would",
     "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
     "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
    "who'd": "who would",
    "who're": "who are",
    "'re": " are",
    "tryin'": "trying",
    "doesn'": "does not",
    'howdo': 'how do',
    'whatare': 'what are',
    'howcan': 'how can',
    'howmuch': 'how much',
    'howmany': 'how many',
    'whydo': 'why do',
    'doI': 'do I',
    'theBest': 'the best',
    'howdoes': 'how does',
}

In [None]:
def clean(df):
    """Run every enabled text-cleaning stage over ``df`` and return the result.

    Stages run in this order: lower-casing, math-token replacement,
    abbreviation expansion (using the global ``abbreviations``), punctuation
    spacing (using the global ``puncts``) and whitespace collapsing.
    The ``clean_unicode``, ``clean_spells`` and ``clean_language`` stages are
    currently disabled.
    """
    stages = (
        clean_lower,
        clean_math,
        lambda frame: clean_abbreviation(frame, abbreviations),
        lambda frame: clean_puncts(frame, puncts),
        clean_space,
    )
    for stage in stages:
        df = stage(df)
    return df

In [None]:
def clean_lower(df):
    """Lower-case every ``question_text`` entry of ``df`` in place and return ``df``."""
    lowered = df["question_text"].apply(lambda text: text.lower())
    df["question_text"] = lowered
    return df

def clean_puncts(df, puncts):
    """Apply ``_clean_puncts`` with ``puncts`` to each ``question_text`` entry; returns ``df``."""
    def space_out(text):
        return _clean_puncts(text, puncts)

    df['question_text'] = df['question_text'].apply(space_out)
    return df
    
def _clean_puncts(x, puncts):
    """Return ``str(x)`` with every occurrence of each item in ``puncts`` padded by a space on both sides."""
    text = str(x)
    for mark in puncts:
        # str.replace is a no-op when the mark is absent, so no membership test is needed.
        text = text.replace(mark, f' {mark} ')
    return text
def clean_abbreviation(df, abbreviations):
    """Replace every key of ``abbreviations`` found in ``question_text`` with its value.

    Args:
        df: DataFrame with a string ``question_text`` column; the column is
            overwritten in place.
        abbreviations: Mapping from the literal text to look for to its
            replacement.

    Returns:
        The same ``df`` with the expanded ``question_text`` column.
    """
    if not abbreviations:
        # An empty alternation would match the empty string at every position
        # and the lookup in ``replace`` would then raise KeyError.
        return df

    # Regex alternation takes the first alternative that matches, so longer
    # keys must come first; otherwise "i'd" would win over "i'd've" and leave
    # a dangling "'ve" behind.
    longest_first = sorted(abbreviations, key=len, reverse=True)
    # Keys are literal text, so escape them before building the pattern.
    compiled_abbreviation = re.compile(
        '(%s)' % '|'.join(re.escape(key) for key in longest_first)
    )

    def replace(match):
        return abbreviations[match.group(0)]

    df['question_text'] = df["question_text"].apply(
        lambda x: _clean_abreviation(x, compiled_abbreviation, replace)
    )
    return df


def _clean_abreviation(x, compiled_re, replace):
    """Return ``x`` with each match of ``compiled_re`` substituted via ``replace``."""
    return compiled_re.sub(replace, x)

def clean_space(df):
    """Collapse each run of whitespace in ``question_text`` to a single space; returns ``df``."""
    whitespace_run = re.compile(r"\s+")

    def collapse(text):
        return _clean_space(text, whitespace_run)

    df['question_text'] = df["question_text"].apply(collapse)
    return df
def _clean_space(x, compiled_re):
    """Return ``x`` with every match of ``compiled_re`` replaced by one space."""
    collapsed = compiled_re.sub(" ", x)
    return collapsed

In [None]:
def clean_math(df):
    """Replace math symbols and LaTeX-style markers in ``question_text`` with `` <math> ``.

    The multi-character markers are substituted first, then the
    single-character symbols. The column is overwritten in place and ``df``
    is returned.
    """
    math_puncts = 'θπα÷⁴≠β²¾∫≥⇒¬∠＝∑Φ√½¼'
    math_puncts_long = [r'\\frac', r'\[math\]', r'\[/math\]', r'\\lim']
    # Joining a string with '|' yields one alternative per character.
    patterns = (
        re.compile('(%s)' % '|'.join(math_puncts_long)),
        re.compile('(%s)' % '|'.join(math_puncts)),
    )
    for pattern in patterns:
        df['question_text'] = df['question_text'].apply(
            lambda x, active=pattern: _clean_math(x, active)
        )
    return df

def _clean_math(x, compiled_re):
    """Return ``x`` with every match of ``compiled_re`` replaced by `` <math> ``."""
    replaced = compiled_re.sub(' <math> ', x)
    return replaced

In [None]:
from multiprocessing import Pool
import re

num_cores = 2
def df_parallelize_run(df, func, num_cores=2):
    """Apply ``func`` to row-wise chunks of ``df`` in worker processes and concatenate the results.

    Args:
        df: DataFrame to process.
        func: Picklable callable that takes a DataFrame chunk and returns a
            DataFrame.
        num_cores: Number of chunks and of worker processes.

    Returns:
        The per-chunk results concatenated in their original order.
    """
    # Slice with ``iloc`` rather than ``np.array_split`` so every chunk is a
    # real DataFrame regardless of the pandas version; the chunk sizes match
    # ``np.array_split`` (the first ``extra`` chunks get one more row).
    base, extra = divmod(len(df), num_cores)
    df_split = []
    start = 0
    for position in range(num_cores):
        stop = start + base + (1 if position < extra else 0)
        df_split.append(df.iloc[start:stop])
        start = stop

    pool = Pool(num_cores)
    try:
        results = pool.map(func, df_split)
    finally:
        # Always release the worker processes, even when ``func`` raises.
        pool.close()
        pool.join()
    return pd.concat(results)

In [None]:
# Clean both frames with `clean`, using the default number of worker processes.
# NOTE: the results overwrite `train_data` / `test_data`, so re-running this
# cell cleans text that has already been cleaned.
train_data = df_parallelize_run(train_data, clean)
test_data = df_parallelize_run(test_data, clean)

In [None]:
print("Train shape : ", train_data.shape)
print("Test shape : ", test_data.shape)

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


In [None]:
train_data.columns

Index(['qid', 'question_text', 'target'], dtype='object')

In [None]:
# The question id is not used as a feature. ``errors='ignore'`` keeps this
# cell re-runnable: a second run no longer raises KeyError once 'qid' is gone.
train_data = train_data.drop(columns=['qid'], errors='ignore')
test_data = test_data.drop(columns=['qid'], errors='ignore')

In [None]:
train_data.isnull().sum()

question_text    0
target           0
dtype: int64

In [None]:
test_data.isnull().sum()

question_text    0
dtype: int64

In [None]:
#from collections import defaultdict
train1_data = train_data[train_data["target"]==1]
train0_data = train_data[train_data["target"]==0]

In [None]:
sincere_data = train0_data.drop(["target"], axis=1)
insincere_data = train1_data.drop(["target"], axis=1)
type(sincere_data)

pandas.core.frame.DataFrame

# Split the data into two classes:


> 1. Sincere data >>> label 0
> 2. Insincere data >>> label 1



In [None]:
#Sincere_class
#with open('sincere.txt', "w") as f3:
#      [ f3.write((row['question_text'])+'\n') for index, row in sincere_data.iterrows()]
#f3.close()

#Insincere_class
#with open('insincere.txt', "w") as f4:
#      [ f4.write((row['question_text'])+'\n') for index, row in insincere_data.iterrows()]
#f4.close()

In [None]:
#!head "/content/drive/My Drive/quora_data/test_data.txt"

In [None]:
!head '/content/drive/My Drive/quora_data/sincere.txt'

how did quebec nationalists see their province as a nation in the 1960s ? 
do you have an adopted dog , how would you encourage people to adopt and not shop ? 
why does velocity affect time ? does velocity affect space geometry ? 
how did otto von guericke used the magdeburg hemispheres ? 
can i convert montra helicon d to a mountain bike by just changing the tyres ? 
is gaza slowly becoming auschwitz , dachau or treblinka for palestinians ? 
why does quora automatically ban conservative opinions when reported , but does not do the same for liberal views ? 
is it crazy if i wash or wipe my groceries off ? germs are everywhere . 
is there such a thing as dressing moderately , and if so , how is that different than dressing modestly ? 
is it just me or have you ever been in this phase wherein you became ignorant to the people you once loved , completely disregarding their feelings / lives so you get to have something go your way and feel temporarily at ease . how did things change ? 


In [None]:
!head '/content/drive/My Drive/quora_data/insincere.txt'

has the united states become the largest dictatorship in the world ? 
which babies are more sweeter to their parents ? dark skin babies or light skin babies ? 
if blacks support school choice and mandatory sentencing for criminals why do not they vote republican ? 
i am gay boy and i love my cousin ( boy ) . he is sexy , but i dont know what to do . he is hot , and i want to see his di * * . what should i do ? 
which races have the smallest penis ? 
why do females find penises ugly ? 
how do i marry an american woman for a green card ? how much do they charge ? 
why do europeans say they are the superior race , when in fact it took them over 2 , 000 years until mid 19th century to surpass china ' s largest economy ? 
did julius caesar bring a tyrannosaurus rex on his campaigns to frighten the celts into submission ? 
in what manner has republican backing of ' states rights ' been hypocritical and what ways have they actually restricted the ability of states to make their own laws ? 


In [None]:
#!cp -R /content/sincere.txt "/content/drive/My Drive/quora_data"
#!cp -R /content/insincere.txt "/content/drive/My Drive/quora_data"


In [None]:
! head "/content/drive/My Drive/quora_data/sincere.txt" -n 20000 > sincere_small.txt


In [None]:
! head "/content/drive/My Drive/quora_data/insincere.txt" -n 50000 > insincere_small.txt


In [None]:
DIRECTORY_URL = '/content/'
FILE_NAMES = ['sincere_small.txt', 'insincere_small.txt']

# Full path of every class file, printed one per line.
text_paths = [DIRECTORY_URL + name for name in FILE_NAMES]
for text_dir in text_paths:
  print(text_dir)

# Directory that holds the class files, taken from the last path built above.
parent_dir = os.path.dirname(text_paths[-1])

parent_dir


/content/sincere_small.txt
/content/insincere_small.txt


'/content'

In [None]:
def labeler(example, index):
  """Pair ``example`` with ``index`` cast to ``tf.int64``."""
  label = tf.cast(index, tf.int64)
  return example, label

# One labeled dataset per file; the label is the file's position in FILE_NAMES.
labeled_data_sets = []

for class_index, file_name in enumerate(FILE_NAMES):
  file_path = os.path.join(parent_dir, file_name)
  labeled_data_sets.append(
      tf.data.TextLineDataset(file_path).map(lambda ex: labeler(ex, class_index)))

In [None]:
labeled_data_sets

[<MapDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 <MapDataset shapes: ((), ()), types: (tf.string, tf.int64)>]

In [None]:
BUFFER_SIZE = 50000
#BUFFER_SIZE = 5000
BATCH_SIZE = 64
#TAKE_SIZE = 5000
TAKE_SIZE = 2000

In [None]:
# Chain the per-class datasets into one, then shuffle once with a fixed order
# (reshuffle_each_iteration=False) so later take/skip splits stay stable.
remaining_sets = iter(labeled_data_sets)
all_labeled_data = next(remaining_sets)
for extra_dataset in remaining_sets:
  all_labeled_data = all_labeled_data.concatenate(extra_dataset)

all_labeled_data = all_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)

In [None]:
for ex in all_labeled_data.take(5):
  print(ex)

(<tf.Tensor: id=10697273, shape=(), dtype=string, numpy=b'what is the minimum user base required for quora to launch in another language ? '>, <tf.Tensor: id=10697274, shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: id=10697275, shape=(), dtype=string, numpy=b'why most of guys lie to get sex from girls ? '>, <tf.Tensor: id=10697276, shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: id=10697277, shape=(), dtype=string, numpy=b'why is a man with no testicles frowned upon by both men & women ? '>, <tf.Tensor: id=10697278, shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: id=10697279, shape=(), dtype=string, numpy=b'what should i do as i want to travel japan ? '>, <tf.Tensor: id=10697280, shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: id=10697281, shape=(), dtype=string, numpy=b'why do iranian people get offended when they are mistaken as arab ? do not they speak arabic language ? '>, <tf.Tensor: id=10697282, shape=(), dtype=int64, numpy=1>)


In [None]:
type(all_labeled_data)

tensorflow.python.data.ops.dataset_ops.ShuffleDataset

In [None]:
tokenizer = tfds.features.text.Tokenizer()

# Collect every distinct token that occurs anywhere in the labeled dataset.
vocabulary_set = {
    token
    for text_tensor, _ in all_labeled_data
    for token in tokenizer.tokenize(text_tensor.numpy())
}

vocab_size = len(vocabulary_set)
vocab_size

39749

Note: the tokenization process takes a long time.

In [None]:
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [None]:
example_text = next(iter(all_labeled_data))[0].numpy()
print(example_text)

b'what is the minimum user base required for quora to launch in another language ? '


In [None]:
encoded_example = encoder.encode(example_text)
print(encoded_example)

[13200, 14300, 26682, 34874, 22165, 31995, 19065, 20070, 15830, 22073, 24087, 2332, 18968, 35668]


In [None]:
def encode(text_tensor, label):
  """Encode the text held by ``text_tensor`` with the global ``encoder``; ``label`` is passed through."""
  token_ids = encoder.encode(text_tensor.numpy())
  return token_ids, label

def encode_map_fn(text, label):
  """Run ``encode`` through ``tf.py_function``, declaring both outputs as ``tf.int64``."""
  encoded_pair = tf.py_function(encode, inp=[text, label], Tout=(tf.int64, tf.int64))
  return encoded_pair

# Apply `encode_map_fn` to every (text, label) element of the labeled dataset.
all_encoded_data = all_labeled_data.map(encode_map_fn)

# Preparing the test data

In [None]:
# Read the test questions, one per line, without the trailing newline.
# The context manager closes the file handle once the list is built.
with open('/content/drive/My Drive/quora_data/test_data.txt') as test_file:
    lineList = [line.rstrip('\n') for line in test_file]

In [None]:
len(lineList)

375806

In [None]:
def pad_to_size(vec, size):
  """Return the items of ``vec`` right-padded with zeros up to ``size`` entries.

  A new list is returned, so the caller's sequence is left unmodified and any
  sequence type (list, tuple, ...) is accepted. Inputs that already hold
  ``size`` or more items are returned without padding and without truncation.
  """
  padded = list(vec)
  # A negative multiplier yields an empty list, so long inputs pass through.
  padded.extend([0] * (size - len(padded)))
  return padded

In [None]:
# Everything after the first TAKE_SIZE encoded examples is used for training
# and shuffled with a BUFFER_SIZE buffer.
# NOTE: this rebinds `train_data`, which earlier cells used for the pandas
# DataFrame.
train_data = all_encoded_data.skip(TAKE_SIZE).shuffle(BUFFER_SIZE)
# padded_shapes=([-1],[]) — presumably pads the token ids to the longest
# sequence in each batch and leaves the scalar label as is; confirm against
# the tf.data documentation.
train_data = train_data.padded_batch(BATCH_SIZE, padded_shapes=([-1],[]))

# The first TAKE_SIZE encoded examples form the validation set.
val_data = all_encoded_data.take(TAKE_SIZE)
val_data = val_data.padded_batch(BATCH_SIZE, padded_shapes=([-1],[]))

In [None]:
sample_text, sample_labels = next(iter(val_data))

sample_labels.numpy()

array([0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0])

In [None]:
# One id more than the number of distinct tokens (presumably to account for
# the 0 used as padding — confirm). Derived from `vocabulary_set` instead of
# incrementing, so re-running this cell does not keep growing the value.
vocab_size = len(vocabulary_set) + 1

# First model: RNN (recurrent neural network)
 
> 1. One LSTM layer.
> 2. Two LSTM layers.



In [None]:
model = tf.keras.Sequential()

In [None]:
model.add(tf.keras.layers.Embedding(vocab_size, 64))

In [None]:
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

In [None]:
 # One or more dense layers.
# Edit the list in the `for` line to experiment with layer sizes.
for units in [64, 64]:
  model.add(tf.keras.layers.Dense(units, activation='relu'))

# Output layer. The first argument is the number of labels.
model.add(tf.keras.layers.Dense(2, activation='softmax'))

In [None]:
model.summary()


Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 64)          2544000   
_________________________________________________________________
bidirectional_7 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_13 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_14 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_15 (Dense)             (None, 2)                 130       
Total params: 2,622,594
Trainable params: 2,622,594
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
history= model.fit(train_data, epochs=3, validation_data=val_data)


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
eval_loss, eval_acc = model.evaluate(val_data)

print('\nEval loss: {:.3f}, Eval accuracy: {:.3f}'.format(eval_loss, eval_acc))


Eval loss: 0.392, Eval accuracy: 0.865


# Second Model (Two LSTM layers):

In [None]:
# Two stacked bidirectional LSTM layers followed by a dense classifier head.
model_2= tf.keras.Sequential([
    tf.keras.layers.Embedding(encoder.vocab_size, 64),
    # return_sequences=True hands the full sequence to the second LSTM layer.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    # The two classes are mutually exclusive and the model is trained with
    # sparse_categorical_crossentropy, so the output must be a probability
    # distribution: use softmax (as the first model does) rather than two
    # independent sigmoids.
    tf.keras.layers.Dense(2, activation='softmax')
])

In [None]:
model_2.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
history = model_2.fit(train_data, epochs=3,
                    validation_data=val_data)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
eval_loss, eval_acc = model_2.evaluate(val_data)

print('\nEval loss: {:.3f}, Eval accuracy: {:.3f}'.format(eval_loss, eval_acc))


Eval loss: 0.329, Eval accuracy: 0.877


In [None]:
def prepare_test(sen, pad=True):
  """Encode ``sen`` with the global ``encoder`` and return the ids as a ``tf.int64`` tensor.

  When ``pad`` is true, the id list is first passed through
  ``pad_to_size(..., 64)``.
  """
  token_ids = encoder.encode(sen)
  token_ids = pad_to_size(token_ids, 64) if pad else token_ids
  return tf.cast(token_ids, tf.int64)

In [None]:
# Encode and pad every test sentence.
test_data = [prepare_test(sen=sentence, pad=True) for sentence in lineList]

In [None]:
def get_label(prediction):
  """Return 0 when the first score in ``prediction[0]`` is greater than the second, otherwise 1."""
  first_score, second_score = prediction[0][0], prediction[0][1]
  return 0 if first_score > second_score else 1

In [None]:
def predict_senetneces_v2(sen, pad):
  """Return the global ``model``'s prediction for the single sentence ``sen``.

  The sentence is encoded (and padded when ``pad`` is true) by
  ``prepare_test``, then given a leading batch dimension before prediction.
  """
  single_item_batch = tf.expand_dims(prepare_test(sen, pad=pad), 0)
  return model.predict(single_item_batch)

In [None]:
test1= "why does velocity affect time?"
pred_test= predict_senetneces_v2(test1, pad= False)
pred_test

array([[0.9776342 , 0.02236583]], dtype=float32)

In [None]:
get_label(pred_test)

0

In [None]:
test2= "White people and black people "
pred_test2= predict_senetneces_v2(test2, pad= False)
pred_test2

In [None]:
get_label(pred_test2)

1

In [None]:
# Predict the first 10000 test sentences, one at a time.
predictions = [predict_senetneces_v2(sen=sentence, pad=True) for sentence in lineList[:10000]]

In [None]:
predictions[1]

In [None]:
pred_label=[get_label(prediction=prediction) for prediction in predictions]

In [None]:
sample_text, sample_labels = next(iter(val_data))

sample_labels.numpy()

array([0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0])

In [None]:
# Labels of the sampled validation batch as a plain list.
y_val = list(sample_labels.numpy())


In [None]:
def get_label2(prediction):
  """Return 0 when ``prediction[0]`` is greater than ``prediction[1]``, otherwise 1."""
  first_score, second_score = prediction[0], prediction[1]
  return 0 if first_score > second_score else 1

In [None]:
y_pred= model.predict(val_data)

In [None]:
y_pred=[get_label2(y) for y in y_pred]

In [None]:
print(y_pred[:100])

[0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1]


In [None]:
from sklearn.metrics import f1_score

In [None]:
# f1_score expects (y_true, y_pred), in that order. `y_val` holds the labels
# of one validation batch only, so the predictions are trimmed to that length
# instead of a hard-coded 64.
score = f1_score(y_val, y_pred[:len(y_val)])
score

0.9411764705882352