# Imports and installs

In [1]:
!pip install sentence-transformers



In [2]:
import nltk
from nltk.corpus import wordnet
import pandas as pd
import random
import numpy as np
import sklearn.decomposition
import matplotlib.pyplot as plt
import pickle
from collections import defaultdict
import string
import re
from gensim.models import KeyedVectors
from nltk import word_tokenize
from tabulate import tabulate

from sklearn.model_selection import train_test_split #split data into train and test sets
from sklearn.feature_extraction.text import CountVectorizer #convert text comment into a numeric vector
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer #use TF IDF transformer to change text vector created by count vectorizer
from sklearn.svm import SVC# Support Vector Machine
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from tensorflow import keras
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU,Conv1D,MaxPooling1D, Flatten, GlobalMaxPooling1D
from keras.models import Model,Sequential, load_model
from keras.regularizers import l2
from keras.callbacks import EarlyStopping, ModelCheckpoint

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import regularizers


In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Feature vector generating functions

### Extra Feature SVM models

In [79]:
def preprocess_text(s):
    """Normalise a raw comment into space-separated ASCII alphanumeric tokens.

    Newlines, tabs, ':' and '#' become spaces; common obfuscation characters
    are mapped to letters ('*'->'u', '@'->'a', '$'->'s', '7'->'s', '2'->'to',
    '8'->'ight', '&'->'and'); remaining punctuation is stripped; every token is
    reduced to [0-9a-zA-Z]; tokens of length <= 1 are discarded.

    Returns the surviving tokens joined by single spaces.
    """
    # Applied in this exact order, one after the other.
    substitutions = (
        ('\n', ' '), ('\t', ' '), (':', ' '), ('#', ' '),
        ('*', 'u'), ('@', 'a'), ('$', 's'),
        ('7', 's'), ('2', 'to'), ('8', 'ight'), ('&', 'and'),
    )
    for old, new in substitutions:
        s = s.replace(old, new)

    s = s.translate(str.maketrans('', '', string.punctuation))
    # After this substitution tokens contain no whitespace, so one join suffices.
    tokens = (re.sub("[^0-9a-zA-Z]+", "", tok) for tok in s.split())
    return " ".join(tok for tok in tokens if len(tok) > 1)

def preprocess_text_for_features(s):
    """Lightly clean a comment while keeping case and punctuation intact.

    Replaces newlines/tabs with spaces, maps the digits '7'->'s', '2'->'to',
    '8'->'ight', and collapses all whitespace runs to single spaces.
    """
    for old, new in (('\n', ' '), ('\t', ' '), ('7', 's'), ('2', 'to'), ('8', 'ight')):
        s = s.replace(old, new)
    return " ".join(s.split())

def transform_x_for_features(df):
    """Return a one-column frame ('comment_text') of lightly cleaned comments.

    Each row's 'comment_text' is passed through preprocess_text_for_features.
    """
    def _clean_row(row):
        return preprocess_text_for_features(row['comment_text'])

    cleaned = df.apply(_clean_row, axis=1)
    return pd.DataFrame(cleaned, columns=['comment_text'])

def transform_x(df):
    """Return a one-column frame ('comment_text') of fully cleaned comments.

    Each row's 'comment_text' is passed through preprocess_text.
    """
    def _clean_row(row):
        return preprocess_text(row['comment_text'])

    cleaned = df.apply(_clean_row, axis=1)
    return pd.DataFrame(cleaned, columns=['comment_text'])

def merge(df1,df2):
    """Place df1 and df2 side by side (column-wise concat, aligned on index)."""
    frames = [df1, df2]
    return pd.concat(frames, axis=1)

def drop_faulty_rows(df):
    """Return df without the rows whose six label columns are all -1.0."""
    label_cols = ['toxic', 'severe_toxic', 'obscene',
                  'threat', 'insult', 'identity_hate']
    # A row is dropped only when every label equals -1.0.
    all_minus_one = (df[label_cols] == -1.0).all(axis=1)
    return df.drop(df[all_minus_one].index)
    
def combine_labels(train_df):
    """Collapse the six label columns into one binary 'Toxic' column.

    A row is 1 when the sum of its six labels is greater than zero, else 0.
    The returned frame has a fresh default index.
    """
    label_cols = ('toxic', 'severe_toxic', 'obscene',
                  'threat', 'insult', 'identity_hate')
    total = train_df[label_cols[0]]
    for col in label_cols[1:]:
        total = total + train_df[col]
    flags = np.where(total > 0, 1, 0)
    return pd.DataFrame(flags, columns=['Toxic'])

# Load word vectors stored in word2vec text format from Drive
# (file name suggests custom 768-d GloVe-style embeddings — TODO confirm provenance).
w2v_whole_data = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/NLP_Project_IITB/Project/jigsaw-toxic-comment-classification-challenge/data/custom_glove_768d.txt')
# Length of the zero vectors returned for unknown words and empty sentences below.
n_dim = 768

def get_word_vec(word):
    """Return the embedding of `word` from `w2v_whole_data`.

    Words missing from the embedding vocabulary map to a zero vector of
    length `n_dim`.
    """
    try:
        return w2v_whole_data.word_vec(word)
    except KeyError:
        # Only the out-of-vocabulary case is handled; other errors (and
        # KeyboardInterrupt) are no longer silently swallowed.
        return np.zeros(n_dim)

# Called with one word at a time by the sentence-embedding helpers.
vect_get_word_vec = np.vectorize(get_word_vec)

def get_sentence_embed(sent):
    """Return the unweighted mean of the word vectors of `sent`.

    `sent` is split on whitespace; a sentence with no words yields a zero
    vector of length `n_dim`.
    """
    tokens = np.array(sent.split())
    if len(tokens) == 0:
        return np.zeros(n_dim)
    vectors = np.array([vect_get_word_vec(tok) for tok in tokens])
    return np.average(vectors, axis=0)

def get_sentence_embed_tf_idf(sent):
    """Return the mean of the word vectors of `sent`, each scaled by its
    `tf_idf_dict` weight.

    A sentence with no words yields a zero vector of length `n_dim`.
    """
    tokens = np.array(sent.split())
    if len(tokens) == 0:
        return np.zeros(n_dim)
    vectors = np.array([vect_get_word_vec(tok) for tok in tokens])
    # Scale row by row, in place, so the array keeps its original dtype.
    for row, tok in enumerate(tokens):
        vectors[row] = tf_idf_dict[tok] * vectors[row]
    return np.average(vectors, axis=0)

# English stop words from NLTK, held in a set for the membership tests in the feature functions.
stop_words = set(nltk.corpus.stopwords.words('english'))  

def get_sentence_embed_tf_idf_with_features(sent):
  """Embed `sent` as a tf_idf_dict-weighted mean word vector plus 18 surface features.

  The first `n_dim` entries are the mean of the word vectors, each scaled by
  its `tf_idf_dict` weight. The remaining entries are, in order:
  character count, word count, upper-case characters, all-upper-case words,
  '!' count, '?' count, '.,;:' count, '*&$%' count, quoted spans (single or
  double quotes), sentence count, unique words, '#' tags, '@' mentions,
  stop-word count, characters per word, words per sentence, unique-word
  ratio and stop-word ratio.

  Returns a 1-D array of length `n_dim + 18`. A sentence with no words yields
  an all-zero vector of that same length, so the output width never depends
  on the input.
  """
  n_extra_features = 18
  words = np.array(sent.split())
  if len(words) == 0:
      # Previously only n_dim zeros were returned here, which did not match
      # the width produced for every non-empty sentence.
      return np.zeros(n_dim + n_extra_features)

  word_vecs = np.array([vect_get_word_vec(x) for x in words])
  for i in range(len(words)):
      word_vecs[i] = tf_idf_dict[words[i]]*word_vecs[i]
  vec = np.average(word_vecs,axis=0).tolist()

  # Quantities used by more than one feature are computed once.
  n_chars = len(sent)
  n_words = len(words)
  n_sentences = len(nltk.sent_tokenize(sent))
  n_unique = len(set(words))
  n_stopwords = len([w for w in words if w in stop_words])

  vec.extend([
      n_chars,                                                   # total length of the sentence
      n_words,                                                   # total number of words
      sum(1 for ch in sent if ch.isupper()),                     # capital characters
      sum(1 for w in words if w.isupper()),                      # all-capital words
      sent.count('!'),                                           # exclamation marks
      sent.count('?'),                                           # question marks
      sent.count('.')+sent.count(',')+sent.count(';')+sent.count(':'),   # punctuation marks
      sent.count('*')+sent.count('&')+sent.count('$')+sent.count('%'),   # symbols
      len(re.findall('"([^"]*)"', sent)) + len(re.findall("'([^']*)'", sent)),  # quoted spans
      n_sentences,                                               # sentences
      n_unique,                                                  # unique words
      len(re.findall(r'(#[A-Za-z0-9]*)', sent)),                 # hashtags
      len(re.findall(r'(@[A-Za-z0-9]*)', sent)),                 # mentions
      n_stopwords,                                               # stop words
      n_chars/n_words,                                           # characters (incl. spaces) per word
      n_words/n_sentences,                                       # words per sentence
      n_unique/n_words,                                          # unique words vs word count
      n_stopwords/n_words,                                       # stop words vs word count
  ])

  return np.array(vec)

def feature_normalise_param(X):
  """Return the maximum of each of 14 count features over X['comment_text'].

  The list is ordered as: character count, word count, capital characters
  (A-Z), all-capital words, '!' count, '?' count, '.,;:' count, '*&$%' count,
  quoted spans, sentence count, unique words, '#' tags, '@' mentions and
  stop-word count.
  """
  comments = X['comment_text']
  feature_counters = (
    len,
    lambda text: len(text.split()),
    lambda text: len(re.findall("([A-Z])", text)),
    lambda text: len([1 for tok in text.split() if tok.isupper()]),
    lambda text: text.count('!'),
    lambda text: text.count('?'),
    lambda text: text.count('.') + text.count(',') + text.count(';') + text.count(':'),
    lambda text: text.count('*') + text.count('&') + text.count('$') + text.count('%'),
    lambda text: len(re.findall('"([^"]*)"', text)) + len(re.findall("'([^']*)'", text)),
    lambda text: len(nltk.sent_tokenize(text)),
    lambda text: len(set(text.split())),
    lambda text: len(re.findall(r'(#[A-Za-z0-9]*)', text)),
    lambda text: len(re.findall(r'(@[A-Za-z0-9]*)', text)),
    lambda text: len([w for w in text.split() if w in stop_words]),
  )
  # One pass over the column per feature, keeping only the column maximum.
  return [comments.apply(counter).max() for counter in feature_counters]

def get_sentence_embed_tf_idf_with_features_norm(sent,maxs):
  """Embed `sent` as a tf_idf_dict-weighted mean word vector plus 18 features,
  with the 14 count features divided by `maxs`.

  Parameters
  ----------
  sent : str
      Sentence to embed; split on whitespace.
  maxs : sequence of 14 numbers
      Divisors for the count features, in the order: character count, word
      count, capital characters, all-capital words, '!' count, '?' count,
      '.,;:' count, '*&$%' count, quoted spans, sentence count, unique words,
      '#' tags, '@' mentions, stop-word count.

  Returns
  -------
  1-D array of length `n_dim + 18`: the weighted mean vector, the 14 scaled
  counts, then four unscaled ratios (characters per word, words per sentence,
  unique-word ratio, stop-word ratio). A sentence with no words yields an
  all-zero vector of that same length.
  """
  n_extra_features = 18
  words = np.array(sent.split())
  if len(words) == 0:
      # Previously only n_dim zeros were returned here, which did not match
      # the width produced for every non-empty sentence.
      return np.zeros(n_dim + n_extra_features)

  word_vecs = np.array([vect_get_word_vec(x) for x in words])
  for i in range(len(words)):
      word_vecs[i] = tf_idf_dict[words[i]]*word_vecs[i]
  vec = np.average(word_vecs,axis=0).tolist()

  # Quantities used by more than one feature are computed once.
  n_chars = len(sent)
  n_words = len(words)
  n_sentences = len(nltk.sent_tokenize(sent))
  n_unique = len(set(words))
  n_stopwords = len([w for w in words if w in stop_words])

  counts = [
      n_chars,                                                   # total length of the sentence
      n_words,                                                   # total number of words
      sum(1 for ch in sent if ch.isupper()),                     # capital characters
      sum(1 for w in words if w.isupper()),                      # all-capital words
      sent.count('!'),                                           # exclamation marks
      sent.count('?'),                                           # question marks
      sent.count('.')+sent.count(',')+sent.count(';')+sent.count(':'),   # punctuation marks
      sent.count('*')+sent.count('&')+sent.count('$')+sent.count('%'),   # symbols
      len(re.findall('"([^"]*)"', sent)) + len(re.findall("'([^']*)'", sent)),  # quoted spans
      n_sentences,                                               # sentences
      n_unique,                                                  # unique words
      len(re.findall(r'(#[A-Za-z0-9]*)', sent)),                 # hashtags
      len(re.findall(r'(@[A-Za-z0-9]*)', sent)),                 # mentions
      n_stopwords,                                               # stop words
  ]
  # Indexing (not zip) so a too-short `maxs` still raises instead of truncating.
  vec.extend(counts[k]/maxs[k] for k in range(len(counts)))

  # Ratio features are already scale-free and are left unnormalised.
  vec.extend([
      n_chars/n_words,         # characters (incl. spaces) per word
      n_words/n_sentences,     # words per sentence
      n_unique/n_words,        # unique words vs word count
      n_stopwords/n_words,     # stop words vs word count
  ])

  return np.array(vec)


In [80]:
def vectorise_tf_idf_1(sent):
  """Lightly clean `sent` and return its weighted embedding as a (1, d) row."""
  cleaned = preprocess_text_for_features(sent)
  embedding = get_sentence_embed_tf_idf(cleaned)
  return np.stack(embedding, axis=0).reshape(1, -1)

def vectorise_tf_idf_with_features_1(sent):
  """Lightly clean `sent` and return embedding + raw surface features as a (1, d) row."""
  cleaned = preprocess_text_for_features(sent)
  embedding = get_sentence_embed_tf_idf_with_features(cleaned)
  return np.stack(embedding, axis=0).reshape(1, -1)

# Divisors for the 14 count features, in the order produced by
# feature_normalise_param (presumably the training-set maxima — TODO confirm).
norm_list = [
    5233, 1411, 4960, 1352, 4942, 209, 682,
    148, 235, 683, 816, 72, 128, 887,
]

def vectorise_tf_idf_with_features_norm_1(sent):
  """Lightly clean `sent` and return embedding + normalised features as a (1, d) row."""
  cleaned = preprocess_text_for_features(sent)
  embedding = get_sentence_embed_tf_idf_with_features_norm(cleaned, norm_list)
  return np.stack(embedding, axis=0).reshape(1, -1)

In [7]:
# Load the training comments from Drive.
train_df = pd.read_csv('/content/drive/MyDrive/NLP_Project_IITB/Project/jigsaw-toxic-comment-classification-challenge/train.csv')
train_df.head(5)  # not the last expression of the cell, so nothing is displayed
# Clean every comment with preprocess_text.
X = transform_x(train_df)
X.head()  # likewise not displayed
# Fit TF-IDF on the cleaned comments to obtain one IDF weight per vocabulary term.
# NOTE(review): TfidfVectorizer lowercases by default, so mixed-case tokens looked up
# later presumably fall back to max_idf — confirm this is intended.
tf_idf = TfidfVectorizer()
tf_idf.fit(X['comment_text'])
# Terms absent from the fitted vocabulary default to the largest IDF value.
max_idf = max(tf_idf.idf_)
tf_idf_dict = defaultdict(
            lambda: max_idf,
            [(w, tf_idf.idf_[i]) for w, i in tf_idf.vocabulary_.items()])

### SVM with Custom Embedding

In [81]:
# Load word vectors in word2vec text format for the "_2" embedding helpers.
# NOTE(review): this is the same file path that is loaded into `w2v_whole_data`
# earlier in the notebook, so the vectors are held in memory twice — consider reusing it.
w2v = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/NLP_Project_IITB/Project/jigsaw-toxic-comment-classification-challenge/data/custom_glove_768d.txt')
# Length of the zero vectors returned for unknown words and empty sentences.
n_dim = 768

def get_word_vec_2(word):
    """Return the embedding of `word` from `w2v`.

    Words missing from the embedding vocabulary map to a zero vector of
    length `n_dim`.
    """
    try:
        return w2v.word_vec(word)
    except KeyError:
        # Only the out-of-vocabulary case is handled; other errors (and
        # KeyboardInterrupt) are no longer silently swallowed.
        return np.zeros(n_dim)

# Called with one word at a time by the sentence-embedding helpers.
vect_get_word_vec_2 = np.vectorize(get_word_vec_2)


def get_sentence_embed_2(sent):
    """Return the unweighted mean of the `w2v` word vectors of `sent`.

    A sentence with no words yields a zero vector of length `n_dim`.
    """
    tokens = np.array(sent.split())
    if len(tokens) == 0:
        return np.zeros(n_dim)
    vectors = np.array([vect_get_word_vec_2(tok) for tok in tokens])
    return np.average(vectors, axis=0)

def get_sentence_embed_tf_idf_2(sent):
    """Return the mean of the `w2v` word vectors of `sent`, each scaled by its
    `tf_idf_dict` weight.

    A sentence with no words yields a zero vector of length `n_dim`.
    """
    tokens = np.array(sent.split())
    if len(tokens) == 0:
        return np.zeros(n_dim)
    vectors = np.array([vect_get_word_vec_2(tok) for tok in tokens])
    # Scale row by row, in place, so the array keeps its original dtype.
    for row, tok in enumerate(tokens):
        vectors[row] = tf_idf_dict[tok] * vectors[row]
    return np.average(vectors, axis=0)


In [11]:
def vectorise_2(sent):
  """Fully clean `sent` and return its mean word embedding as a (1, d) row."""
  cleaned = preprocess_text(sent)
  embedding = get_sentence_embed_2(cleaned)
  return np.stack(embedding, axis=0).reshape(1, -1)

def vectorise_tf_idf_2(sent):
  """Fully clean `sent` and return its weighted mean embedding as a (1, d) row."""
  cleaned = preprocess_text(sent)
  embedding = get_sentence_embed_tf_idf_2(cleaned)
  return np.stack(embedding, axis=0).reshape(1, -1)


### SVM using Doc_2_vec

In [12]:
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell.
from gensim.models.doc2vec import Doc2Vec
# Load the saved Doc2Vec model from Drive; used by vectorise_doc2vec_3 below.
doc2vec = Doc2Vec.load("/content/drive/MyDrive/NLP_Project_IITB/Project/jigsaw-toxic-comment-classification-challenge/data/doc2vec_model")

def vectorise_doc2vec_3(sent):
  """Fully clean and tokenize `sent`, then return its inferred Doc2Vec vector as a (1, d) row."""
  tokens = word_tokenize(preprocess_text(sent))
  inferred = doc2vec.infer_vector(tokens)
  return np.reshape(np.stack(inferred, axis=0), (1, -1))


### SVM with Count Vectoriser

In [15]:
def vectorise_doc2vec_4(sent):
  """Return the fully cleaned sentence wrapped in a 1-element array.

  Despite the name, no Doc2Vec model is involved here: the raw cleaned text
  is returned as-is.
  """
  cleaned = preprocess_text(sent)
  return np.array([cleaned])

### SVM using NN

In [16]:
def custom_predict_5(classifier, x_t, batch_size=32, extractor=None, kernel_approximation=None, reshapeLayer=None):
    """Predict labels for `x_t` in mini-batches, optionally through a feature extractor.

    Parameters
    ----------
    classifier : object with ``predict(batch)`` returning a 1-D array-like.
    x_t : sliceable sequence of samples (supports ``len`` and ``x_t[a:b]``).
    batch_size : int, number of samples per ``predict`` call.
    extractor : callable or None. When given, each batch is passed through it
        and ``.numpy()`` is called on the result before classification.
    kernel_approximation : object or None. Only used together with `extractor`.
        If it exposes ``fit_transform`` it is applied to the extracted batch;
        any other non-None value falls back to the module-level
        ``feature_map_nystroem`` (the original behaviour).
    reshapeLayer : any or None. When not None, extracted batches are flattened
        to 2-D ``(n_samples, -1)`` (needed when the extractor output has more
        than two dimensions, e.g. a Conv1D layer).

    Returns
    -------
    1-D numpy array with one prediction per sample, in input order.
    """
    # Seeding with an empty float array keeps the dtype promotion the original
    # incremental hstack produced.
    batch_preds = [np.array([])]
    n_samples = len(x_t)
    for start in range(0, n_samples, batch_size):
        stop = min(start + batch_size, n_samples)
        x_t_b = x_t[start:stop]
        if extractor is not None:
            x_t_b = extractor(x_t_b).numpy()
            if kernel_approximation is not None:
                # Prefer the transformer that was actually passed in; the old code
                # ignored the argument and required a global of a fixed name.
                if hasattr(kernel_approximation, 'fit_transform'):
                    approximator = kernel_approximation
                else:
                    approximator = feature_map_nystroem
                # NOTE(review): re-fitting on every prediction batch is kept from the
                # original; confirm whether a plain transform() was intended.
                x_t_b = approximator.fit_transform(x_t_b)
            if reshapeLayer is not None:
                x_t_b = x_t_b.reshape(x_t_b.shape[0], -1)
        batch_preds.append(np.ravel(classifier.predict(x_t_b)))
    # Single concatenation instead of growing the result array batch by batch.
    return np.hstack(batch_preds).ravel()

In [17]:
# Load the pickled tokenizer; the `with` block closes the file handle afterwards.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
with open('/content/drive/MyDrive/NLP_Project_IITB/Project/jigsaw-toxic-comment-classification-challenge/data/tokenizer.pkl','rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)
# Sequence length passed to pad_sequences by the prediction helpers below.
maxlen = 500
# for samples < n_components of kernel approximation Nystroem cannot be done
def prediction_5(model, sent, extractor, reshapeLayer=None):
    """Classify one sentence with `model` on top of NN-extracted features.

    The sentence is tokenized with the module-level `tokenizer`, post-padded to
    `maxlen`, and passed to custom_predict_5 with the given `extractor`.
    Returns 'Toxic' when the first prediction equals 1, otherwise 'Not Toxic'.
    """
    token_ids = tokenizer.texts_to_sequences([sent])
    padded = pad_sequences(token_ids, maxlen=maxlen, padding='post')
    labels = custom_predict_5(model, padded, extractor=extractor, reshapeLayer=reshapeLayer)
    return 'Toxic' if labels[0] == 1 else 'Not Toxic'


def sgd_prediction_5(model, sent):
    """Classify one sentence with `model` directly on the padded token ids.

    The sentence is tokenized with the module-level `tokenizer` and post-padded
    to `maxlen`. Returns 'Toxic' when the first prediction equals 1, otherwise
    'Not Toxic'.
    """
    token_ids = tokenizer.texts_to_sequences([sent])
    padded = pad_sequences(token_ids, maxlen=maxlen, padding='post')
    labels = model.predict(padded)
    return 'Toxic' if labels[0] == 1 else 'Not Toxic'


# Each section loads a Keras model, builds a sub-model ending at `layer_name` to act as a
# feature extractor, and unpickles the classifier that consumes those features.
# Pickle files are opened in `with` blocks so the handles are closed after loading.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.

# Model 1-Flatten
model1_flatten = load_model('/content/drive/MyDrive/NLP_Project_IITB/Project/training_models/text_feature_extractor-ep003-loss0.202-acc0.899-val_loss0.202-val_acc0.899.hdf5')
layer_name = "flatten"
extractor_model1_flatten = Model(inputs=model1_flatten.inputs,
                        outputs=model1_flatten.get_layer(layer_name).output)
with open('/content/drive/MyDrive/NLP_Project_IITB/Project/training_models/svm-sgd-model1-no-kernel-extractor-flatten.pkl','rb') as fh:
    model1_flatten_svm = pickle.load(fh)


# Model 1-Oversampled
model1_oversampled_flatten = load_model('/content/drive/MyDrive/NLP_Project_IITB/Project/training_models/over-sampled-text_feature_extractor-ep001-loss0.709-acc0.645-val_loss0.203-val_acc0.898.hdf5')
layer_name = 'flatten'
extractor_model1_oversampled_flatten = Model(inputs=model1_oversampled_flatten.inputs,
                        outputs=model1_oversampled_flatten.get_layer(layer_name).output)
with open('/content/drive/MyDrive/NLP_Project_IITB/Project/training_models/svm-sgd-model1-oversampled-no-kernel-extractor-flatten.pkl','rb') as fh:
    model1_oversampled_flatten_svm = pickle.load(fh)


# Model 3-Flatten
model3_flatten = load_model('/content/drive/MyDrive/NLP_Project_IITB/Project/training_models/Model-Binary-loss-gpu-ep002-loss0.013-acc0.985-val_loss0.277-val_acc0.934.hdf5')
layer_name = "flatten"
extractor_model3_flatten = Model(inputs=model3_flatten.inputs,
                        outputs=model3_flatten.get_layer(layer_name).output)
with open('/content/drive/MyDrive/NLP_Project_IITB/Project/training_models/svm-sgd-model3-extractor-flatten.pkl','rb') as fh:
    model3_flatten_svm = pickle.load(fh)


#Model 3-Conv1d
# NOTE(review): same .hdf5 file as Model 3-Flatten above, loaded a second time.
model3_conv1d = load_model('/content/drive/MyDrive/NLP_Project_IITB/Project/training_models/Model-Binary-loss-gpu-ep002-loss0.013-acc0.985-val_loss0.277-val_acc0.934.hdf5')
layer_name = "conv1D"
extractor_model3_conv1d = Model(inputs=model3_conv1d.inputs,
                        outputs=model3_conv1d.get_layer(layer_name).output)
with open('/content/drive/MyDrive/NLP_Project_IITB/Project/training_models/svm-sgd-model3-extractor-conv1D.pkl','rb') as fh:
    model3_conv1d_svm = pickle.load(fh)

# SGD Classifier
with open('/content/drive/MyDrive/NLP_Project_IITB/Project/models/sgd_no-oversampling.pkl','rb') as fh:
    sgd_classifier = pickle.load(fh)

# SGD Oversampled Classifier
with open('/content/drive/MyDrive/NLP_Project_IITB/Project/models/sgd_oversampled.pkl','rb') as fh:
    sgd_classifier_oversample = pickle.load(fh)



### SVM Models using Sentence Embeddings

In [18]:
# Unpickle the sentence-embedding model used by vectoriser_6.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
with open('/content/drive/MyDrive/NLP_Project_IITB/Project/models/sbert_model.pkl','rb') as handle: 
    sbert_model = pickle.load(handle) 
def vectoriser_6(sent):
    """Fully clean `sent` and encode it with `sbert_model` as a one-sentence batch."""
    cleaned = preprocess_text(sent)
    return sbert_model.encode([cleaned])

# Loading all the SVM pre-trained Models

In [19]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards.

    NOTE: pickle.load can execute arbitrary code, so only use this on trusted files.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

#---Extra Features SVM Models---
# NOTE(review): these three paths omit the 'Project/' segment used by every other
# model path below — confirm the files really live there.
SVM_rbf_custom_embedding_tf_idf_without_features = _load_pickle('/content/drive/MyDrive/NLP_Project_IITB/models/SVM_tf_idf.pkl')
SVM_rbf_custom_embedding_tf_idf_with_features = _load_pickle('/content/drive/MyDrive/NLP_Project_IITB/models/SVM_tf_idf_features.pkl')
SVM_rbf_custom_embedding_tf_idf_with_features_normalised = _load_pickle('/content/drive/MyDrive/NLP_Project_IITB/models/SVM_tf_idf_features_norm.pkl')

#---Custom Embeddings SVM Models---
linear_svm_avg_custom_word2vec_embedding = _load_pickle('/content/drive/MyDrive/NLP_Project_IITB/Project/models/lsvm_emb_on_Train_avged')
linear_svm_avg_tfidf_custom_word2vec_embedding = _load_pickle('/content/drive/MyDrive/NLP_Project_IITB/Project/models/lsvm_emb_on_Train_tfidf_avged')
svm_rbf_avg_custom_word2vec_embedding = _load_pickle('/content/drive/MyDrive/NLP_Project_IITB/Project/models/svm_rbf_emb_on_Train_avged')
sgd_classifier_avg_custom_word2vec_embedding = _load_pickle('/content/drive/MyDrive/NLP_Project_IITB/Project/models/sgd_emb_on_Train_avged')
sgd_classifier_avg_tfidf_custom_word2vec_embedding = _load_pickle('/content/drive/MyDrive/NLP_Project_IITB/Project/models/sgd_emb_on_Train_tfidf_avged')

#---Doc2Vec Embeddings SVM Models---
svm_rbf_doc2vec_custom_embedding = _load_pickle('/content/drive/MyDrive/NLP_Project_IITB/Project/models/svm_rbf_d2v_emb.pkl')
sgd_doc2vec_custom_embedding = _load_pickle('/content/drive/MyDrive/NLP_Project_IITB/Project/models/sgd_d2v_emb.pkl')

#---Count Vectoriser SVM Models---
linear_svm_countVectorizer_tfdif = _load_pickle('/content/drive/MyDrive/NLP_Project_IITB/Project/models/linearSVM_countvec_tfidf.pkl')
linear_svm_countVectorizer_tfdif_nystroem_approximation = _load_pickle('/content/drive/MyDrive/NLP_Project_IITB/Project/models/'+'nystroem_n_comp{}'.format(1000))
linear_svm_countVectorizer_tfdif_fourier_approximation = _load_pickle('/content/drive/MyDrive/NLP_Project_IITB/Project/models/'+'fourier_n_comp{}'.format(1000))
svm_rbf_countVectorizer_tfdif = _load_pickle('/content/drive/MyDrive/NLP_Project_IITB/Project/models/svm-test-train-count_vec_pipeline.pkl')
sgd_svm_countVectorizer_tfdif = _load_pickle('/content/drive/MyDrive/NLP_Project_IITB/Project/models/sgdSVM-test-train-count_vec_pipeline.pkl')
sgd_svm_countVectorizer_tfdif_nystroem_approximation = _load_pickle('/content/drive/MyDrive/NLP_Project_IITB/Project/models/sgd_nystroem_approx_svm-test-train-count_vec_pipeline.pkl')
sgd_svm_countVectorizer_tfdif_fourier_approximation = _load_pickle('/content/drive/MyDrive/NLP_Project_IITB/Project/models/sgd_fourier_approx_svm-test-train-count_vec_pipeline.pkl')

#---SVM Models using bert-base-nli-mean-tokens sentence Embeddings---
SVM_rbf_sentence_embedding = _load_pickle('/content/drive/MyDrive/NLP_Project_IITB/Project/models/svm-embedding-rbf.pkl')
SVM_sigmoid_sentence_embedding = _load_pickle('/content/drive/MyDrive/NLP_Project_IITB/Project/models/svm-embedding-sigmoid.pkl')
SVM_polynomial_sentence_embedding = _load_pickle('/content/drive/MyDrive/NLP_Project_IITB/Project/models/svm-embedding-poly.pkl')


# Prediction function to get output from all the SVM models

In [90]:
def Predict(sent):
  """Run `sent` through every loaded classifier and print a comparison table.

  One row is printed per model with its verdict ('Toxic' / 'Not Toxic'),
  followed by the number of models that predicted 'Toxic'. Every model
  contributes exactly one row. Returns None.
  """
  def verdict(prediction):
    # `prediction` is the single-element output of model.predict(...).
    return "Toxic" if prediction else "Not Toxic"

  # Each representation is computed once and shared by the models that use it.
  # (The Doc2Vec vector was previously inferred separately for each Doc2Vec model.)
  tfidf_vec = vectorise_tf_idf_1(sent)
  tfidf_features_vec = vectorise_tf_idf_with_features_1(sent)
  tfidf_features_norm_vec = vectorise_tf_idf_with_features_norm_1(sent)
  avg_vec = vectorise_2(sent)
  avg_tfidf_vec = vectorise_tf_idf_2(sent)
  doc2vec_vec = vectorise_doc2vec_3(sent)
  cleaned_text = vectorise_doc2vec_4(sent)
  sentence_vec = vectoriser_6(sent)

  res = []

  #--------------------Models fed a precomputed representation-----------------------#
  vector_models = [
    # Extra-feature SVM models
    ("SVM_rbf_custom_embedding_tf_idf_without_features", SVM_rbf_custom_embedding_tf_idf_without_features, tfidf_vec),
    ("SVM_rbf_custom_embedding_tf_idf_with_features", SVM_rbf_custom_embedding_tf_idf_with_features, tfidf_features_vec),
    ("SVM_rbf_custom_embedding_tf_idf_with_features_normalised", SVM_rbf_custom_embedding_tf_idf_with_features_normalised, tfidf_features_norm_vec),
    # SVM custom embedding models
    ("linear_svm_avg_custom_word2vec_embedding", linear_svm_avg_custom_word2vec_embedding, avg_vec),
    ("linear_svm_avg_tfidf_custom_word2vec_embedding", linear_svm_avg_tfidf_custom_word2vec_embedding, avg_tfidf_vec),
    ("svm_rbf_avg_custom_word2vec_embedding", svm_rbf_avg_custom_word2vec_embedding, avg_vec),
    ("sgd_classifier_avg_custom_word2vec_embedding", sgd_classifier_avg_custom_word2vec_embedding, avg_vec),
    ("sgd_classifier_avg_tfidf_custom_word2vec_embedding", sgd_classifier_avg_tfidf_custom_word2vec_embedding, avg_tfidf_vec),
    # SVM Doc2Vec models
    ("svm_rbf_doc2vec_custom_embedding", svm_rbf_doc2vec_custom_embedding, doc2vec_vec),
    ("sgd_doc2vec_custom_embedding", sgd_doc2vec_custom_embedding, doc2vec_vec),
    # SVM count-vectoriser models (fed the cleaned text)
    ("linear_svm_countVectorizer_tfdif", linear_svm_countVectorizer_tfdif, cleaned_text),
    ("linear_svm_countVectorizer_tfdif_nystroem_approximation", linear_svm_countVectorizer_tfdif_nystroem_approximation, cleaned_text),
    ("linear_svm_countVectorizer_tfdif_fourier_approximation", linear_svm_countVectorizer_tfdif_fourier_approximation, cleaned_text),
    ("svm_rbf_countVectorizer_tfdif", svm_rbf_countVectorizer_tfdif, cleaned_text),
    ("sgd_svm_countVectorizer_tfdif", sgd_svm_countVectorizer_tfdif, cleaned_text),
    ("sgd_svm_countVectorizer_tfdif_nystroem_approximation", sgd_svm_countVectorizer_tfdif_nystroem_approximation, cleaned_text),
    ("sgd_svm_countVectorizer_tfdif_fourier_approximation", sgd_svm_countVectorizer_tfdif_fourier_approximation, cleaned_text),
  ]
  for name, model, features in vector_models:
    res.append([name, verdict(model.predict(features))])

  #--------------------SVM NN Models-----------------------#
  # The Conv1D extractor output must be flattened, hence reshapeLayer=True.
  nn_models = [
    ("model1_flatten_svm", model1_flatten_svm, extractor_model1_flatten, None),
    ("model1_oversampled_flatten_svm", model1_oversampled_flatten_svm, extractor_model1_oversampled_flatten, None),
    ("model3_flatten_svm", model3_flatten_svm, extractor_model3_flatten, None),
    ("model3_conv1d_svm", model3_conv1d_svm, extractor_model3_conv1d, True),
  ]
  for name, svm, extractor, reshape in nn_models:
    res.append([name, prediction_5(svm, sent, extractor, reshapeLayer=reshape)])

  #--------------------SGD Model on NN preprocessing setting-----------------------#
  # Both verdicts are always recorded; the oversampled model's row used to be
  # dropped whenever it predicted 'Not Toxic'.
  sgd_models = [
    ("sgd_classifier_with_NN_preprocessing", sgd_classifier),
    ("sgd_classifier_oversample_with_NN_preprocessing", sgd_classifier_oversample),
  ]
  for name, clf in sgd_models:
    res.append([name, sgd_prediction_5(clf, sent)])

  #--------------------SVM Models using Sentence Embeddings-----------------------#
  sentence_models = [
    ("SVM_rbf_sentence_embedding", SVM_rbf_sentence_embedding),
    ("SVM_sigmoid_sentence_embedding", SVM_sigmoid_sentence_embedding),
    ("SVM_polynomial_sentence_embedding", SVM_polynomial_sentence_embedding),
  ]
  for name, model in sentence_models:
    res.append([name, verdict(model.predict(sentence_vec))])

  toxic_count = sum(1 for _, label in res if label == "Toxic")

  print(tabulate(res, headers=['Model', 'Prediction']))
  print("\nTotal Toxic Predictions = ",toxic_count)


# Analysis for ad-hoc sentences

In [91]:
# NN capture even typos of abusive words better
Predict("ffffffuuuuucccckkkkk off")


Model                                                     Prediction
--------------------------------------------------------  ------------
SVM_rbf_custom_embedding_tf_idf_without_features          Not Toxic
SVM_rbf_custom_embedding_tf_idf_with_features             Not Toxic
SVM_rbf_custom_embedding_tf_idf_with_features_normalised  Not Toxic
linear_svm_avg_custom_word2vec_embedding                  Toxic
linear_svm_avg_tfidf_custom_word2vec_embedding            Toxic
svm_rbf_avg_custom_word2vec_embedding                     Not Toxic
sgd_classifier_avg_custom_word2vec_embedding              Toxic
sgd_classifier_avg_tfidf_custom_word2vec_embedding        Toxic
svm_rbf_doc2vec_custom_embedding                          Not Toxic
sgd_doc2vec_custom_embedding                              Not Toxic
linear_svm_countVectorizer_tfdif                          Not Toxic
linear_svm_countVectorizer_tfdif_nystroem_approximation   Toxic
linear_svm_countVectorizer_tfdif_fourier_approximation    Not To

In [92]:
# Contextual information is not well captured; individual words dominate the toxic labels.
Predict("Fuck off is such a derogatory term and should not be used.")


Model                                                     Prediction
--------------------------------------------------------  ------------
SVM_rbf_custom_embedding_tf_idf_without_features          Toxic
SVM_rbf_custom_embedding_tf_idf_with_features             Toxic
SVM_rbf_custom_embedding_tf_idf_with_features_normalised  Toxic
linear_svm_avg_custom_word2vec_embedding                  Toxic
linear_svm_avg_tfidf_custom_word2vec_embedding            Toxic
svm_rbf_avg_custom_word2vec_embedding                     Toxic
sgd_classifier_avg_custom_word2vec_embedding              Toxic
sgd_classifier_avg_tfidf_custom_word2vec_embedding        Toxic
svm_rbf_doc2vec_custom_embedding                          Not Toxic
sgd_doc2vec_custom_embedding                              Not Toxic
linear_svm_countVectorizer_tfdif                          Toxic
linear_svm_countVectorizer_tfdif_nystroem_approximation   Toxic
linear_svm_countVectorizer_tfdif_fourier_approximation    Toxic
svm_rbf_countVectori

In [84]:
#Longer sentences given more weightage.
Predict("You bastard.")
Predict("You bastard. You cann't do anything properly. Sucha useless mess")


Model                                                     Prediction
--------------------------------------------------------  ------------
SVM_rbf_custom_embedding_tf_idf_without_features          Not Toxic
SVM_rbf_custom_embedding_tf_idf_with_features             Not Toxic
SVM_rbf_custom_embedding_tf_idf_with_features_normalised  Not Toxic
linear_svm_avg_custom_word2vec_embedding                  Toxic
linear_svm_avg_tfidf_custom_word2vec_embedding            Toxic
svm_rbf_avg_custom_word2vec_embedding                     Toxic
sgd_classifier_avg_custom_word2vec_embedding              Toxic
sgd_classifier_avg_tfidf_custom_word2vec_embedding        Toxic
svm_rbf_doc2vec_custom_embedding                          Not Toxic
sgd_doc2vec_custom_embedding                              Not Toxic
linear_svm_countVectorizer_tfdif                          Toxic
linear_svm_countVectorizer_tfdif_nystroem_approximation   Toxic
linear_svm_countVectorizer_tfdif_fourier_approximation    Toxic
svm_rbf_

In [96]:
# Some words that appear alongside slang, but are not obscene or toxic on
# their own, are still classified as Toxic.
# The two inputs are the same benign sentence with only the kinship word
# swapped (MOTHER / SISTER).
# NOTE(review): the kinship word is upper-cased in both inputs; whether Predict
# normalises case is not visible in this cell.
Predict("Your MOTHER is good.")
Predict("Your SISTER is good.")


Model                                                     Prediction
--------------------------------------------------------  ------------
SVM_rbf_custom_embedding_tf_idf_without_features          Toxic
SVM_rbf_custom_embedding_tf_idf_with_features             Toxic
SVM_rbf_custom_embedding_tf_idf_with_features_normalised  Toxic
linear_svm_avg_custom_word2vec_embedding                  Toxic
linear_svm_avg_tfidf_custom_word2vec_embedding            Toxic
svm_rbf_avg_custom_word2vec_embedding                     Toxic
sgd_classifier_avg_custom_word2vec_embedding              Toxic
sgd_classifier_avg_tfidf_custom_word2vec_embedding        Toxic
svm_rbf_doc2vec_custom_embedding                          Toxic
sgd_doc2vec_custom_embedding                              Toxic
linear_svm_countVectorizer_tfdif                          Toxic
linear_svm_countVectorizer_tfdif_nystroem_approximation   Not Toxic
linear_svm_countVectorizer_tfdif_fourier_approximation    Not Toxic
svm_rbf_countVectori

In [86]:
# Identity-term probe: one sentence template in which only the racial
# descriptor changes ("black", "white", "white trash"), so the three
# predictions can be compared term by term.
Predict("You are of black race.")
Predict("You are of white race.")
Predict("You are of white trash race.")


Model                                                     Prediction
--------------------------------------------------------  ------------
SVM_rbf_custom_embedding_tf_idf_without_features          Toxic
SVM_rbf_custom_embedding_tf_idf_with_features             Not Toxic
SVM_rbf_custom_embedding_tf_idf_with_features_normalised  Not Toxic
linear_svm_avg_custom_word2vec_embedding                  Toxic
linear_svm_avg_tfidf_custom_word2vec_embedding            Toxic
svm_rbf_avg_custom_word2vec_embedding                     Not Toxic
sgd_classifier_avg_custom_word2vec_embedding              Toxic
sgd_classifier_avg_tfidf_custom_word2vec_embedding        Toxic
svm_rbf_doc2vec_custom_embedding                          Not Toxic
sgd_doc2vec_custom_embedding                              Not Toxic
linear_svm_countVectorizer_tfdif                          Not Toxic
linear_svm_countVectorizer_tfdif_nystroem_approximation   Toxic
linear_svm_countVectorizer_tfdif_fourier_approximation    Toxic
svm_

In [87]:
# Contrast pair: the first input uses the mildly profane compound "smart ass",
# the second is the plain compliment "smart" with the profane word removed.
Predict("You are a smart ass.")
Predict("You are smart.")


Model                                                     Prediction
--------------------------------------------------------  ------------
SVM_rbf_custom_embedding_tf_idf_without_features          Toxic
SVM_rbf_custom_embedding_tf_idf_with_features             Not Toxic
SVM_rbf_custom_embedding_tf_idf_with_features_normalised  Toxic
linear_svm_avg_custom_word2vec_embedding                  Toxic
linear_svm_avg_tfidf_custom_word2vec_embedding            Toxic
svm_rbf_avg_custom_word2vec_embedding                     Toxic
sgd_classifier_avg_custom_word2vec_embedding              Toxic
sgd_classifier_avg_tfidf_custom_word2vec_embedding        Toxic
svm_rbf_doc2vec_custom_embedding                          Not Toxic
sgd_doc2vec_custom_embedding                              Not Toxic
linear_svm_countVectorizer_tfdif                          Toxic
linear_svm_countVectorizer_tfdif_nystroem_approximation   Toxic
linear_svm_countVectorizer_tfdif_fourier_approximation    Toxic
svm_rbf_countVec

In [88]:
# Negation probe: the same insult with and without "not".
# NOTE(review): the first input reads "is a not an" (stray "a"); it is part of
# the text passed to Predict, so fixing it would change the model input.
Predict("Donald Trump is a not an asshole.")
Predict("Donald Trump is an asshole.")


Model                                                     Prediction
--------------------------------------------------------  ------------
SVM_rbf_custom_embedding_tf_idf_without_features          Not Toxic
SVM_rbf_custom_embedding_tf_idf_with_features             Not Toxic
SVM_rbf_custom_embedding_tf_idf_with_features_normalised  Not Toxic
linear_svm_avg_custom_word2vec_embedding                  Toxic
linear_svm_avg_tfidf_custom_word2vec_embedding            Toxic
svm_rbf_avg_custom_word2vec_embedding                     Toxic
sgd_classifier_avg_custom_word2vec_embedding              Toxic
sgd_classifier_avg_tfidf_custom_word2vec_embedding        Toxic
svm_rbf_doc2vec_custom_embedding                          Not Toxic
sgd_doc2vec_custom_embedding                              Not Toxic
linear_svm_countVectorizer_tfdif                          Toxic
linear_svm_countVectorizer_tfdif_nystroem_approximation   Toxic
linear_svm_countVectorizer_tfdif_fourier_approximation    Not Toxic
svm_

In [89]:
# Input made up almost entirely of profanity and slurs; every model row visible
# in the captured output below predicts Toxic.
Predict("You mother fucking asshole peice of shit dickhead Faggot !!")

Model                                                     Prediction
--------------------------------------------------------  ------------
SVM_rbf_custom_embedding_tf_idf_without_features          Toxic
SVM_rbf_custom_embedding_tf_idf_with_features             Toxic
SVM_rbf_custom_embedding_tf_idf_with_features_normalised  Toxic
linear_svm_avg_custom_word2vec_embedding                  Toxic
linear_svm_avg_tfidf_custom_word2vec_embedding            Toxic
svm_rbf_avg_custom_word2vec_embedding                     Toxic
sgd_classifier_avg_custom_word2vec_embedding              Toxic
sgd_classifier_avg_tfidf_custom_word2vec_embedding        Toxic
svm_rbf_doc2vec_custom_embedding                          Toxic
sgd_doc2vec_custom_embedding                              Toxic
linear_svm_countVectorizer_tfdif                          Toxic
linear_svm_countVectorizer_tfdif_nystroem_approximation   Toxic
linear_svm_countVectorizer_tfdif_fourier_approximation    Toxic
svm_rbf_countVectorizer_tfdi

In [97]:
# Sarcasm probe: a complaint worded positively ("I love ...") with no profanity;
# every model row visible in the captured output below predicts Not Toxic.
Predict("I love this airlines for sending me to london and my luggage to Delhi.")

Model                                                     Prediction
--------------------------------------------------------  ------------
SVM_rbf_custom_embedding_tf_idf_without_features          Not Toxic
SVM_rbf_custom_embedding_tf_idf_with_features             Not Toxic
SVM_rbf_custom_embedding_tf_idf_with_features_normalised  Not Toxic
linear_svm_avg_custom_word2vec_embedding                  Not Toxic
linear_svm_avg_tfidf_custom_word2vec_embedding            Not Toxic
svm_rbf_avg_custom_word2vec_embedding                     Not Toxic
sgd_classifier_avg_custom_word2vec_embedding              Not Toxic
sgd_classifier_avg_tfidf_custom_word2vec_embedding        Not Toxic
svm_rbf_doc2vec_custom_embedding                          Not Toxic
sgd_doc2vec_custom_embedding                              Not Toxic
linear_svm_countVectorizer_tfdif                          Not Toxic
linear_svm_countVectorizer_tfdif_nystroem_approximation   Not Toxic
linear_svm_countVectorizer_tfdif_fourier_app

In [100]:
# Profane word used in a third-person question ("these Bastards") rather than
# in a direct "You ..." insult; the visible models in the captured output
# below disagree on it (mostly Not Toxic, two Toxic).
Predict("Why don't these Bastards leave us in peace?")

Model                                                     Prediction
--------------------------------------------------------  ------------
SVM_rbf_custom_embedding_tf_idf_without_features          Not Toxic
SVM_rbf_custom_embedding_tf_idf_with_features             Not Toxic
SVM_rbf_custom_embedding_tf_idf_with_features_normalised  Not Toxic
linear_svm_avg_custom_word2vec_embedding                  Not Toxic
linear_svm_avg_tfidf_custom_word2vec_embedding            Not Toxic
svm_rbf_avg_custom_word2vec_embedding                     Not Toxic
sgd_classifier_avg_custom_word2vec_embedding              Not Toxic
sgd_classifier_avg_tfidf_custom_word2vec_embedding        Toxic
svm_rbf_doc2vec_custom_embedding                          Not Toxic
sgd_doc2vec_custom_embedding                              Not Toxic
linear_svm_countVectorizer_tfdif                          Toxic
linear_svm_countVectorizer_tfdif_nystroem_approximation   Not Toxic
linear_svm_countVectorizer_tfdif_fourier_approximati